In [1]:
#Loading the packages required: 
from tensorflow.keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, LSTM, Multiply, Softmax, Embedding
from tensorflow.keras.layers import RepeatVector, Dense, Activation, Lambda
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.activations import softmax
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import load_model, Model
from tensorflow import  reshape, shape, transpose, ones, linalg
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

import numpy as np
from faker import Faker
import random
from tqdm import tqdm
from babel.dates import format_date
from nmt_utils import *
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd



from tensorflow.keras.layers import MultiHeadAttention, Dense, Dropout, LayerNormalization, Layer
from tensorflow.keras.models import Sequential
from tensorflow import  reshape, shape, transpose, ones, linalg
from sklearn.model_selection import train_test_split 
from nltk.tokenize import word_tokenize, RegexpTokenizer
import re

In [2]:
#Loading the dataset 
m = 10000 #number of training samples 
dataset, human_vocab, machine_vocab, inv_machine_vocab = load_dataset(m)

100%|█████████████████████████████████| 10000/10000 [00:00<00:00, 122450.35it/s]


In [3]:
dataset[0:10] 

[('29 oct 1992', '1992-10-29'),
 ('23.07.70', '1970-07-23'),
 ('3/19/15', '2015-03-19'),
 ('tuesday may 13 1986', '1986-05-13'),
 ('friday march 9 1990', '1990-03-09'),
 ('monday august 11 1980', '1980-08-11'),
 ('thursday january 4 2001', '2001-01-04'),
 ('10 nov 1978', '1978-11-10'),
 ('22 oct 1976', '1976-10-22'),
 ('monday september 20 1993', '1993-09-20')]

In [4]:
def preprocess_data(X,Y, human_vocab, machine_vocab, Ty, Tx):

    #Convert each date into a vector of integers corresponding to its index in human_vocab (for X) or machine-vocab (for Y): 
    X = np.array([string_to_int(i, Tx, human_vocab) for i in X])
    Y = [string_to_int(t, Ty, machine_vocab) for t in Y]
    
    # Add the <sos> and <end> tokens to Y: 
    Y = [[11] + i + [12] for i in Y]

    #Create the one-hot vectors (will be used as input) 
    #Xoh = np.array(list(map(lambda x: to_categorical(x, num_classes=len(human_vocab)), X))) #one-hot vector of each X element
    #Yoh = np.array(list(map(lambda x: to_categorical(x, num_classes=len(machine_vocab)), Y)))

    return X, np.array(Y)#, Xoh, Yoh

In [5]:
machine_vocab = machine_vocab | {'<sos>':11, '<end>':12}
machine_vocab

{'-': 0,
 '0': 1,
 '1': 2,
 '2': 3,
 '3': 4,
 '4': 5,
 '5': 6,
 '6': 7,
 '7': 8,
 '8': 9,
 '9': 10,
 '<sos>': 11,
 '<end>': 12}

In [6]:
del human_vocab['<pad>']
human_vocab = {key: value + 1 for key, value in human_vocab.items()}
human_vocab = {'<pad>':0} | human_vocab | {'arb': 37} # adding this since for the positional embeddings we need even values 
human_vocab

{'<pad>': 0,
 ' ': 1,
 '.': 2,
 '/': 3,
 '0': 4,
 '1': 5,
 '2': 6,
 '3': 7,
 '4': 8,
 '5': 9,
 '6': 10,
 '7': 11,
 '8': 12,
 '9': 13,
 'a': 14,
 'b': 15,
 'c': 16,
 'd': 17,
 'e': 18,
 'f': 19,
 'g': 20,
 'h': 21,
 'i': 22,
 'j': 23,
 'l': 24,
 'm': 25,
 'n': 26,
 'o': 27,
 'p': 28,
 'r': 29,
 's': 30,
 't': 31,
 'u': 32,
 'v': 33,
 'w': 34,
 'y': 35,
 '<unk>': 36,
 'arb': 37}

In [7]:
# Unlist the tuples; seperate and save the human-readable and machine-readable dates into X and Y respectively. 
X, Y = zip(*dataset)
Tx = len(max(X, key=len))

In [8]:
reverse_vocabulary = {index: word for word, index in machine_vocab.items()}
print(reverse_vocabulary)
def reverse_lookup(indices):
    return [reverse_vocabulary[index] for index in indices if index in reverse_vocabulary]
reverse_lookup([10])


{0: '-', 1: '0', 2: '1', 3: '2', 4: '3', 5: '4', 6: '5', 7: '6', 8: '7', 9: '8', 10: '9', 11: '<sos>', 12: '<end>'}


['9']

### Split the dataset to training and testing sets: 

In [9]:
#dividing the dataset into 75% training set and 25% test set: 
X_train, X_test, y_train, y_test = train_test_split(X,Y, 
                                   random_state=104,  
                                   test_size=0.25,  
                                   shuffle=True) 


In [10]:
X_train[0]

'23 april 2006'

In [11]:
y_train[0:3]

['2006-04-23', '1980-02-16', '1997-12-08']

In [12]:
Ty = 10 
X_trainmod, y_trainmod = preprocess_data(X_train,y_train, human_vocab, machine_vocab, Ty, 27)
X_testmod, y_testmod = preprocess_data(X_test, y_test, human_vocab, machine_vocab, Ty, 27) 

In [13]:
X_trainmod[0]

array([ 6,  7,  1, 14, 28, 29, 22, 24,  1,  6,  4,  4, 10,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0])

In [14]:
y_trainmod[0]

array([11,  3,  1,  1,  7,  0,  1,  5,  0,  3,  4, 12])

In [15]:
print("X.shape:", X_trainmod.shape)
print("Y.shape:", y_trainmod.shape)
#print("Xoh.shape:", Xoh.shape)
#print("Yoh.shape:", Yoh.shape)

X.shape: (7500, 27)
Y.shape: (7500, 12)


In [16]:
print(f"First element of X_train is :\n{X_train[0]}")
print(f"First element of y_train is :\n{y_train[0]}")
print(f"First encoding element of y_trainmod is :\n{y_trainmod[0]}")
print(f"Second encoding element of y_trainmod: \n{y_trainmod[1]}")
print(f"First integer encoding of X_train: \n{X_trainmod[0]}")

First element of X_train is :
23 april 2006
First element of y_train is :
2006-04-23
First encoding element of y_trainmod is :
[11  3  1  1  7  0  1  5  0  3  4 12]
Second encoding element of y_trainmod: 
[11  2 10  9  1  0  1  3  0  2  7 12]
First integer encoding of X_train: 
[ 6  7  1 14 28 29 22 24  1  6  4  4 10  0  0  0  0  0  0  0  0  0  0  0
  0  0  0]


In [17]:
X_trainmod.shape

(7500, 27)

In [18]:
# TEST/ Define the embedding layer for the input of the encoder (human-readible dates)
len_human_vocab_ = len(human_vocab)  # Example vocabulary size
embedding_dim = 20  # Embedding dimension

# Define an embedding layer
Encoding_embedding = tf.keras.layers.Embedding(input_dim=len_human_vocab_, output_dim=embedding_dim)

# Example usage
input_indices = X_trainmod
input_encoder = Encoding_embedding(input_indices)
input_encoder.shape 

TensorShape([7500, 27, 20])

In [19]:
input_encoder[0][-1] # this is the encoding for the last string of the first sample 

<tf.Tensor: shape=(20,), dtype=float32, numpy=
array([-0.00577899,  0.01323602,  0.04087318, -0.02204746,  0.04948512,
        0.00802944, -0.00022287,  0.03216836,  0.01461811, -0.03330668,
       -0.01023803,  0.00774448,  0.00988574, -0.02216876,  0.02318745,
       -0.01249168,  0.04581759,  0.04978187,  0.04951015, -0.00880697],
      dtype=float32)>

In [20]:
# TEST/ Define the embedding layer for the input of the encoder (human-readible dates)
len_machine_vocab = len(machine_vocab)  # Example vocabulary size
embedding_dim = 20  # Embedding dimension

# Define an embedding layer
Encoding_embedding = tf.keras.layers.Embedding(input_dim=len_machine_vocab, output_dim=embedding_dim)

# Example usage
input_indices = y_trainmod
input_decoder = Encoding_embedding(input_indices)
print(input_decoder.shape)
print(input_decoder[0][0])
input_decoder[1][0] #same embedding for the start token in all samples 

(7500, 12, 20)
tf.Tensor(
[ 0.00802339  0.03491603 -0.03945272  0.01754576 -0.00411788 -0.02898422
  0.03989638  0.04164628 -0.02419833 -0.00774594 -0.02297484  0.02806025
 -0.01405926  0.0110396  -0.01564211 -0.02295054  0.01978591  0.02532512
  0.02784434  0.01005604], shape=(20,), dtype=float32)


<tf.Tensor: shape=(20,), dtype=float32, numpy=
array([ 0.00802339,  0.03491603, -0.03945272,  0.01754576, -0.00411788,
       -0.02898422,  0.03989638,  0.04164628, -0.02419833, -0.00774594,
       -0.02297484,  0.02806025, -0.01405926,  0.0110396 , -0.01564211,
       -0.02295054,  0.01978591,  0.02532512,  0.02784434,  0.01005604],
      dtype=float32)>

- To the machine vocabulary, we have added two tokens $<sos>$ and $<end>$. The decoder is now able to predict the end of the sequence.  
- The length is no longer chosen but set as the length of the longest string among the inputs. 

#### Add positional encodings: 

In [21]:
# Calculate the angles for positional embeddings: 

def get_angles(pos, k, d):
    """
    Get the angles for the positional encoding
    
    Arguments:
        pos -- Column vector containing the positions [[0], [1], ...,[N-1]]
        k --   Row vector containing the dimension span [[0, 1, 2, ..., d-1]]
        d(integer) -- Encoding size
    
    Returns:
        angles -- (pos, d) numpy array 
    """
    
    # Get i from dimension span k
    i = k//2
    # Calculate the angles using pos, i and d
    angles = pos/ (10000)**(2*i/d)

    
    return angles
    
def pos_emb(len_seq,len_emb): 
    
    """
    This function creates the positional embeddings for all the words in the sequence based on: 
    
    Input: 
    len_seq (int) : The length of the sequences inputed into the model. 
    len_emb (int) : The length of the word embeddings for every word in the sequence. 

    Note: the size of the positional encoding and the word embeddings must match in order to add them in the next step. 

    Output: 
    res (np.array(len_seq, len_emb)) : ith row of this matrix represents the positional encodings for the ith position in the sequence. 

    """

    len_i = int(len_emb/2)

    # Initialize the matrix to save positional encodings: 
    res = np.zeros((len_seq,len_emb))
    angles = np.zeros((len_seq,len_emb))
    
    #for each position in the sequence 
    for pos in range(len_seq): #there are 30 words so position ranges between 0-29
        
        #calculate the angles: 
        for i in range(len_i): #ranges between 0 - 24
            angles[pos,2*i] = pos/(10000**(2*i/len_emb))
            angles[pos, 2*i +1] = pos/(10000**(2*i/len_emb)) 
        
        # Calculate the entries corresponding to each position 
        #for j in range(len_i): 
        res[pos, 0::2] = np.sin(angles[pos,0::2])
        res[pos,1::2] = np.cos(angles[pos,0::2])
            
    return(tf.cast(res.reshape(1,len_seq,len_emb), dtype=tf.float32))


In [22]:
# TEST/ Add positional encodings to the input of the encoder: 
pos_encoding_en = pos_emb(27,20)
input_encoder = input_encoder + pos_encoding_en


In [23]:
input_encoder.shape

TensorShape([7500, 27, 20])

In [24]:
# TEST/ Add positional encodings to the input of the decoder: 
pos_encoding_dec = pos_emb(12,20)
input_decoder = input_decoder + pos_encoding_dec

In [25]:
input_decoder.shape

TensorShape([7500, 12, 20])

#### Define the masks: 

##### Padding mask

In [26]:
def create_padding_mask(matrix,num_heads):
    """
    Creates a matrix mask for the padding cells
    
    Arguments:
        seq -- (n, m) matrix
    
    Returns:
        mask -- (n, 1, 1, m) binary tensor
    """
    # Check if each row is all zeros
    zero_rows = tf.cast(tf.equal(matrix, 0), dtype=tf.float32)

    padded_matrix_1 = tf.repeat(tf.expand_dims(zero_rows, axis=1), repeats=num_heads, axis=1)
    final_mask = tf.cast(tf.expand_dims(padded_matrix_1, axis=2),dtype=tf.float32)
    # Convert boolean array to integer array (0s and 1s)
    #padded_mask = zero_rows.astype(int)
    # Expand to make 4D: 
    #expanded_padding_mask_init = tf.expand_dims(padded_mask, axis=1)
    #expanded_padding_mask_final = tf.expand_dims(expanded_padding_mask_init, axis=1)
    # Repeat for each head: 
    #final_mask = tf.cast(tf.tile(expanded_padding_mask_final, [1, num_heads, 1, 1]),tf.float32)  # (batch_size, num_heads, 1, seq_len)

    return final_mask

#### Example: 

When we have an input, we want to make sure the zeros get mapped to zero attention. Let us see a sample 

In [27]:
X_trainmod[0]# so all the way after the 10th token, we need padding.

array([ 6,  7,  1, 14, 28, 29, 22, 24,  1,  6,  4,  4, 10,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0])

In [238]:
padding_mask = create_padding_mask(X_trainmod[0:1], 2) 
padding_mask.shape #come back to this. 

TensorShape([1, 2, 1, 27])

In [31]:
padding_mask

<tf.Tensor: shape=(1, 2, 1, 27), dtype=float32, numpy=
array([[[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]],

        [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1.,
          1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]],
      dtype=float32)>

In [285]:
# Define Query, Key, and Value matrices
dense_q = tf.cast(Dense(units = 40)(input_encoder[0:1]), dtype=tf.float32) # shape = (#samples, len_seq, dim_q)
dense_k = tf.cast(Dense(units = 40)(input_encoder[0:1]),dtype = tf.float32) # shape = (#samples, len_seq, dim_k) 
dense_v = tf.cast(Dense(units = 40)(input_encoder[0:1]), dtype = tf.float32) # shape = (#samples, len_seq, dim_v) 
# Reshape the Query, Key, and Value matrices 
dense_qre = reshape_tensor(dense_q, 2, pre_attention = True) #shape = (#samples, #heads, dim_q/heads, len_seq)
dense_kre = reshape_tensor(dense_k, 2, pre_attention = True) #shape = (#samples, #heads, dim_k/heads, len_seq)
dense_vre = reshape_tensor(dense_v, 2, pre_attention = True)
# Calculate the attention scores
attention_scores, res = self_attention(dense_qre,dense_kre,dense_vre, masking = padding_mask)
attention_scores.shape 

TensorShape([1, 2, 27, 27])

In [286]:
attention_scores[0][0][0] #This is the attention scores for the first sample, first head, first word. 

<tf.Tensor: shape=(27,), dtype=float32, numpy=
array([0.06970534, 0.07402434, 0.07621909, 0.07359982, 0.07238382,
       0.07415678, 0.07832458, 0.0819699 , 0.08164559, 0.07979792,
       0.07662789, 0.07788742, 0.08365756, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        ], dtype=float32)>

### Look-Ahead Mask 

In [34]:
def create_look_ahead_mask(dim): 
    
    """
    At each iteration of the decoder making predictions, pass the length of the input (dim) to this function to mask the proceeding words
    
    """
    # keeps the main diagonal and all sub-diagonals and sets all super-diagonals to zero: 
    mask = 1 - linalg.band_part(ones((dim, dim)), -1, 0) 
    expanded_mask_init = tf.expand_dims(mask, axis = 0) #(1,len_seq, len_seq) 
    expanded_mask_final = tf.expand_dims(expanded_mask_init, axis = 0)
 
    return expanded_mask_final

In [283]:
look_ahead_mask = create_look_ahead_mask(10)
look_ahead_mask

<tf.Tensor: shape=(1, 1, 10, 10), dtype=float32, numpy=
array([[[[0., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         [0., 0., 1., 1., 1., 1., 1., 1., 1., 1.],
         [0., 0., 0., 1., 1., 1., 1., 1., 1., 1.],
         [0., 0., 0., 0., 1., 1., 1., 1., 1., 1.],
         [0., 0., 0., 0., 0., 1., 1., 1., 1., 1.],
         [0., 0., 0., 0., 0., 0., 1., 1., 1., 1.],
         [0., 0., 0., 0., 0., 0., 0., 1., 1., 1.],
         [0., 0., 0., 0., 0., 0., 0., 0., 1., 1.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
         [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]]], dtype=float32)>

#### Example Look-ahead mask 

In [193]:
# Define the Q,K,V 
dense_q = tf.cast(Dense(units = 40)(input_encoder[0:1]), dtype=tf.float32) # shape = (#samples, len_seq, dim_q)
dense_k = tf.cast(Dense(units = 40)(input_encoder[0:1]),dtype = tf.float32) # shape = (#samples, len_seq, dim_k) 
dense_v = tf.cast(Dense(units = 40)(input_encoder[0:1]), dtype = tf.float32) # shape = (#samples, len_seq, dim_v) 
# Reshape based on the number of heads 
dense_qre = reshape_tensor(dense_q, 2, pre_attention = True) #shape = (#samples, #heads, dim_q/heads, len_seq)
dense_kre = reshape_tensor(dense_k, 2, pre_attention = True) #shape = (#samples, #heads, dim_k/heads, len_seq)
dense_vre = reshape_tensor(dense_v, 2, pre_attention = True)
attention_scores, res = self_attention(dense_qre, dense_kre,dense_vre, masking = create_look_ahead_mask(dense_q.shape[1]))
attention_scores.shape # 1 sample, 2 heads, len_seq = 27 

TensorShape([1, 2, 27, 27])

In [284]:
attention_scores[0][0][1] #first sample, first head, first word. the amount of attention needed to pay for each. 
#this shows that the model only pays attention to the first word to make a prediction for the second word. 

<tf.Tensor: shape=(27,), dtype=float32, numpy=
array([0.49960408, 0.50039595, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        ], dtype=float32)>

#### Define the self Attention

In [36]:
def self_attention(q,k,v,masking):
    """
    This function applied the self-attention mechanism to a given input. 
    
    """
    
    # Perform matrix multiplication on the last two dimensions
    dotqk = tf.matmul(q, k, transpose_b = True) #must be of size (batch_size, seq_len, seq_len) 

    dim_k = tf.cast(40,tf.float32) 
    normalized_dotqk = dotqk/tf.math.sqrt(dim_k)
    
    #then add the masking if masking if given" 
    if masking is not None: 
        normalized_dotqk += masking* -1e9
    
    attention_scores =  tf.nn.softmax(tf.cast(normalized_dotqk, dtype=tf.float32),axis = -1)
    res = tf.matmul(attention_scores,v) 
    
    return(attention_scores, res)
    

#### Define A feed forward neural network: 

In [37]:
def FullFeedForward(n_1, emb_size):#the model must return vectors of the same size as the embeddings of the input so can be combined with decoder
    model = Sequential([
    Dense(n_1, activation='tanh', name="dense1"), #relu? (#samples, len_seq, n_1)
    Dense(emb_size, activation='tanh', name="dense2")# linear? (#samples, len_seq, emb_size)
])
    return(model)
    

In [38]:
# Define a reshape_tensor which will be later on used for the Multi-head attention: 

def reshape_tensor(q_matrix, heads, pre_attention): 
    """
    
    """
    
    #pre_attention, we'll need to reform into 4d 
    if pre_attention:

        dense_qre = reshape(q_matrix, (shape(q_matrix)[0], shape(q_matrix)[1], heads, -1))
        dense_qre = transpose(dense_qre, ([0, 2, 1, 3]))
        
        
    #post_attention, we'll need to revert back to 3d: 1125, 2, 30, 15]
    else: 
        q_matrix_transpose = transpose(q_matrix, ([0,2,1,3]))
        dense_qre = reshape(q_matrix_transpose, (shape(q_matrix_transpose)[0], shape(q_matrix_transpose)[1], -1)) 
        
        
    return(dense_qre)
        

In [39]:
class MultiHeadAttention(Layer): 

    def __init__(self, dim_kv, dim_q, len_emb, heads, **kwargs):
        
        super(MultiHeadAttention, self).__init__(**kwargs) 
        self.heads = heads
        self.denseq = Dense(units = dim_q)
        self.densek = Dense(units = dim_kv)
        self.densev = Dense(units = dim_kv) 
        self.dense = Dense(units = len_emb)
        self.self_attention = self_attention
    
    def call(self,q,k,v,masking, **kwargs): #by passing self, you passed all the attributes you've defined above. 
       
        # Define the query, key, and value matrices: 
        #print(f"dim of q is {q.shape}, dim of k is {k.shape}")
        dense_q = self.denseq(q) # shape = (#samples, len_seq, dim_q)
        dense_k = self.densek(k) # shape = (#samples, len_seq, dim_k) 
        dense_v = self.densev(v) # shape = (#samples, len_seq, dim_v) 
        
        # Reshape: 
        dense_qre = reshape_tensor(dense_q, self.heads, pre_attention = True) #shape = (#samples, #heads, len_seq, dim_q/heads)
        dense_kre = reshape_tensor(dense_k, self.heads, pre_attention = True) #shape = (#samples, #heads, len_seq, dim_k/heads)
        dense_vre = reshape_tensor(dense_v, self.heads, pre_attention = True) #shape = (#samples, #heads, len_seq, dim_v/heads)
        
        # Calculate the attention scores: 
        attention_scores, res = self.self_attention(dense_qre, dense_kre,dense_vre,masking) #shape = (#samples, #heads, dim_q/heads, len_seq)
        # Revert the shape:
        attention_with_v = reshape_tensor(res, self.heads, pre_attention = False) #shape = (#samples, len_seq, dim_q)
        
        return(attention_with_v)


In [40]:
# Check if it works: 
dim_kv = 20 #we keep the dimension of k and q the same for the dot product to work. and then the dim of v the same so that mult happens
dim_q = 20
len_emb = 20
heads = 2 
masking = None
print(f"shape of input to the MHA: {input_encoder.shape}")
mha = MultiHeadAttention(dim_kv, dim_q, len_emb, heads)
mha(input_encoder, input_encoder,input_encoder, masking = None).shape


shape of input to the MHA: (7500, 27, 20)


### Define the Encoder

In [41]:
class Encoder(tf.keras.layers.Layer):
    
    def __init__(self, dim_kv, dim_q, heads, fnn_neurons, len_emb,len_human_vocab, iter, drop_rate):
        
        super(Encoder,self).__init__()
        self.heads   = heads
        self.iter    = iter
        self.len_emb = len_emb

        self.enc_emb = Embedding(input_dim=len_human_vocab, output_dim=len_emb, input_length=27)
        self.mha     = MultiHeadAttention(dim_kv, dim_q, len_emb, heads)
        self.fnn     = FullFeedForward(fnn_neurons, len_emb)
        
        self.norm1   = LayerNormalization(epsilon = 1e-6)
        self.norm2   = LayerNormalization(epsilon = 1e-6)
        
        self.drop1    = Dropout(rate = drop_rate)
        self.drop2    = Dropout(rate = drop_rate)
        self.drop3    = Dropout(rate = drop_rate)
        
        
    def call(self,x, training= False, enc_mask = False): 
        """
        This block calculates the Encoder output for number of iterations = iter.
        
        Input
        x        : input of the encoder - provided by the dataset available 
        training : if training == True, dropout layers will be active. 
        masking  : Boolean if True then the padding mask will be calculated and applied. 

        Output 
        x        : the final x returned will be the output of the encoder after iter loops. 
        
        """
        # Define the padding mask if True 
        padding_mask = create_padding_mask(x,self.heads) if enc_mask else None

        # Add Embedding Layer 
        x = self.enc_emb(x)

        # Add positional encoding
        x += pos_emb(x.shape[1], len_emb)
        x *= tf.math.sqrt(tf.cast(self.len_emb,tf.float32))
        # Encoder layers
        for _ in range(self.iter): 

            # Add dropout layer: 
            drop_x = self.drop1(x, training = training) 
            
            # Calculate the attention scores: 
            mha_scores = self.mha(drop_x, drop_x, drop_x, masking = padding_mask)
        
            # Add dropout and normalize: 
            dropout_1 = self.drop2(mha_scores, training = training)
            norm_1  = self.norm1(dropout_1 + x )
        
            #Run through a fully connected neural network: 
            fnn_output = self.fnn(norm_1) 
            
            # Add dropout: 
            dropout_2 = self.drop3(fnn_output, training = training)
        
            # Normalize: 
            x = self.norm2(dropout_2 + norm_1)
            
        return x
     

In [42]:
# Making sure the Encoder block works: 
dim_kv, dim_q, len_emb = 20,20,20
heads = 2
masking = False
fnn_neurons = 20
drop_rate = 0.1
print(f"shape of the input given: {input_encoder.shape}")
len_human_vocab = len(human_vocab) 
encoder = Encoder(dim_kv, dim_q, heads, fnn_neurons, len_emb,len_human_vocab, iter = 6, drop_rate = 0.1) 
enc_output = encoder(X_trainmod, training = True, enc_mask= True) #input_decoder is the embeddings + positional encodings 
enc_output.shape


shape of the input given: (7500, 27, 20)




TensorShape([7500, 27, 20])

### Define the Decoder 

In [43]:
class Decoder(tf.keras.layers.Layer): 

    def __init__(self, len_emb, dim_kv, dim_q, heads, 
                dd_model, iter, len_seq_out,  
                drop_rate = 0.1, epsilon = 1e-6):  #dd_model is the number of neurons in the last layer of decoder (dense with softmax) 
        
        super(Decoder, self).__init__()
        
        self.len_emb     = len_emb
        self.iter        = iter
        self.heads       = heads
        self.len_seq_out = len_seq_out

        self.emb_layer  = Embedding(input_dim=len_machine_vocab, output_dim=len_emb)
        self.mha1       = MultiHeadAttention(dim_kv, dim_q, len_emb, heads)
        self.mha2       = MultiHeadAttention(dim_kv, dim_q, len_emb, heads) 
        self.dense      = FullFeedForward(dd_model, len_emb)
        
        self.drop1       = Dropout(rate = drop_rate)
        self.drop2       = Dropout(rate = drop_rate)
        self.drop3       = Dropout(rate = drop_rate)
        self.drop4       = Dropout(rate = drop_rate)
        
        self.layernorm1 = LayerNormalization(epsilon = epsilon)
        self.layernorm2 = LayerNormalization(epsilon = epsilon)
        self.layernorm3 = LayerNormalization(epsilon = epsilon)
       

    def call(self, x, enc_output, training= False): #dec_mask = False): 
        
        """
        if training == True: 
            - The look-ahead mask will be defined within the model 
            - The model will make predictions for all time steps at once 
            - Dropout layers are on 
            
        if training == False: 
            - The look-ahead mask is None 
            - The model will make predictions sequentially, 
            - The model keeps predicting until the end of the sequence is reached, 
            - Dropout layers are inactive 

        Input 
        x    : The input sequence to the decoder 
        enc_output : The output of the Encoder passed to the Decoder 
        training   : Boolean value, if True the Dropout layers are active and look-ahead mask is applied 
        
        """
        
        len_seq = x.shape[1]

        # Define the padding mask if needed
        dec_pad_mask = create_padding_mask(x, self.heads) if training else None # this must also be modified if pads are not added. 

        # Inference mode
        look_ahead_mask1 = None if not training else create_look_ahead_mask(len_seq)
        
       
        # Add embeddings: 
        x = self.emb_layer(x)
        
        # Add positional encoding 
        x += pos_emb(len_seq, self.len_emb)
        x *= tf.math.sqrt(tf.cast(self.len_emb,tf.float32))
       
        for _ in range(iter):
            # Add a dropout layer: 
            x = self.drop1(x, training = training) 
               
            # Run through a MHA with the look-forward mask: 
            attn_mat1 = self.mha1(x, x, x, masking = look_ahead_mask1)
                
            # Add dropout here during training:  
            attn_mat1 = self.drop2(attn_mat1, training = training)
                
            # Add and Normalize: 
            attn_mat1_x = self.layernorm1(attn_mat1 + x)
                
            # Cross Attention
            # Define the mask if needed  #####-> made some changes here too for training replaced dec_mask 
            if training: 
                cross_mha_mask = np.zeros((x.shape[0], self.heads, attn_mat1_x.shape[1], enc_output.shape[1]))
                for i in range(cross_mha_mask.shape[2]): 
                    cross_mha_mask[:,:,i,:] = tf.repeat(dec_pad_mask[:,:,:,i], repeats=enc_output.shape[1], axis=2)
            else: 
                cross_mha_mask = None 
            # Run the cross attention 
            attn_mat2 = self.mha2(attn_mat1_x , enc_output, enc_output, masking = cross_mha_mask)
                
            # Add dropout during training: 
            attn_mat2 = self.drop3(attn_mat2, training = training) 
                
            # Add and Normalize: 
            attn_mat2_x = self.layernorm2(attn_mat2 +  attn_mat1_x) 
                
            # Run through a dense layer: 
            dense_output = self.dense(attn_mat2_x)
                
            # Add Dropout: 
            dense_output = self.drop4(dense_output, training = training)
                
            # Add and Normalize: 
            x = self.layernorm3(dense_output + attn_mat2_x) #(#samples, 12,20)-> #(#samples, 12,1) --> (#samples, 1, 12)
            
        return(x) 
               

In [95]:
#Check if it works after you've defined your output sequence (decoder input):  
dim_kv = 20
dim_q = 20
len_emb = 20 
heads = 2
masking = None
dd_model = 20
drop_rate = 0.1
iter = 1
len_seq_out = 11

func_decoder = Decoder(len_emb, dim_kv, dim_q, heads, 
                           dd_model, iter,len_seq_out, drop_rate = 0.1, epsilon = 1e-6)
decoder_output = func_decoder(y_train_dec_input, enc_output, training = False)
decoder_output.shape

TensorShape([7500, 11, 20])

In [45]:
decoder_output[0][0] #logits not probabilities 

<tf.Tensor: shape=(20,), dtype=float32, numpy=
array([-0.12847579, -1.1864185 , -1.0001552 ,  1.4832157 ,  0.35379234,
        0.69271374, -0.53467846,  0.3083954 , -0.8806666 ,  0.30249602,
        0.04884396, -1.3301146 ,  1.2648648 ,  1.0878731 , -0.91357577,
        0.17336014, -2.0089796 ,  1.3717271 , -0.4965886 ,  1.3923708 ],
      dtype=float32)>

### Modifying the target dataset to feed into the Transformer

Let us print out the y_trainmod: 

In [240]:
y_trainmod[0:10] #10 samples including the <sos> and <end> tokens 

array([[11,  3,  1,  1,  7,  0,  1,  5,  0,  3,  4, 12],
       [11,  2, 10,  9,  1,  0,  1,  3,  0,  2,  7, 12],
       [11,  2, 10, 10,  8,  0,  2,  3,  0,  1,  9, 12],
       [11,  2, 10, 10,  3,  0,  2,  2,  0,  3,  1, 12],
       [11,  2, 10,  8, 10,  0,  1,  9,  0,  2,  7, 12],
       [11,  2, 10,  9, 10,  0,  2,  3,  0,  3,  1, 12],
       [11,  2, 10, 10,  5,  0,  2,  3,  0,  1,  6, 12],
       [11,  3,  1,  2,  1,  0,  1,  9,  0,  2,  9, 12],
       [11,  2, 10,  8,  3,  0,  1,  3,  0,  1,  9, 12],
       [11,  3,  1,  1, 10,  0,  1,  5,  0,  1,  7, 12]])

The input of the decoder must start with the start token but the end token must be predicted by the model. Therefore, removing the end tokens from the input of the decoder; We will refer to the input of the decoder as **y_train_dec_input**.

In [198]:
y_train_dec_input  = y_trainmod[:,:-1] 
y_train_dec_input

array([[11,  3,  1, ...,  0,  3,  4],
       [11,  2, 10, ...,  0,  2,  7],
       [11,  2, 10, ...,  0,  1,  9],
       ...,
       [11,  3,  1, ...,  0,  2, 10],
       [11,  3,  1, ...,  0,  4,  1],
       [11,  3,  1, ...,  0,  3,  5]])

The expected output of the decoder will be y_trainmod but only shifted one time step forward and no <sos> token; we will refer to the target output as **y_trainmod_dec_output**.

In [241]:
y_trainmod_dec_output = y_trainmod[:,1:]
print(y_trainmod_dec_output.shape)
print(y_trainmod_dec_output[0:3])
print(f"this is the desired output to be predicted") 

(7500, 11)
[[ 3  1  1  7  0  1  5  0  3  4 12]
 [ 2 10  9  1  0  1  3  0  2  7 12]
 [ 2 10 10  8  0  2  3  0  1  9 12]]
this is the desired output to be predicted


### Some Notes on the Decoder 

- Training Mode

    The training mode uses a method called teacher training in which all parts of the input are fed into the model. The model is designed to make predictions one at a time but during training we will make 12 predictions parallally, meaning that for example the <sos> token is given, then it goes through a layer embeddig creating a vector of 20 indexes. We will then add the positional encodings. The model then goes through the causal MHA in which the attention scores between the input decoder words are calculated. Then, we add the look-ahead mask; this mask makes sure that at each iteration of prediction t the model only sees the t-1 words before the current word. Then the result goes through the cross MHA. The two feed forward neural networks (last one with a softmax activation function) to make predictions.

  Note that the teacher training ensures that no matter the prediction of the model, at the next time step, the correct output will be fed into the model. The correct output that is fed to the model during training will be replaced by the output of the decoder during the inference mode. But since we already know what the input of the next step should be during training, we can run 12 calculations parallaly. 
  
- Inference Mode

  This is the mode in which the deocder makes predictions iteratively for t = 1:12 time steps (given that the token at t=0 refers to the start token). At each time j, the model predicts the all the words from t = 1:j. We will take only the last prediction (which is the new word), taking the argmax of that vector of probabilities gives us the next token predicted. This token is then concatenated with the original input of the model and the decoder runs again to make predictions for the 1:j+1 words at time j+1.

How would the training and gradients work: 

In [232]:
def init_decoder(shape):
    
    """
    This is a function to initialize the inputs of the decoder during the inference mode 
    
    Input 
    shape: The number of samples to make a prediction for 
    
    """
    
    return np.full((shape, 1), 11)


### Define the Custom Learning Rate 

In [136]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  """
  Custom learning rate schedule that implements the learning rate function
  described in the original Transformer paper. The learning rate is increased
  linearly for the first `warmup_steps` training steps, and then decreased
  proportionally to the inverse square root of the step number.

  Args:
    d_model (int): the dimensionality of the model.
    warmup_steps (int): the number of steps taken to increase the learning rate
      linearly. Default is 4000.

  Attributes:
    d_model (float): the dimensionality of the model as a float.
    warmup_steps (int): the number of steps taken to increase the learning rate
      linearly.

  Methods:
    __call__(step): returns the learning rate at the given step.

  Returns:
    The learning rate at the given step.
  """
  def __init__(self, d_model, warmup_steps=4000):
    super().__init__()

    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)

    self.warmup_steps = warmup_steps

  def __call__(self, step):
    """
    Returns the learning rate at the given step.

    Args:
      step (int): the current training step.

    Returns:
      The learning rate at the given step as a float32 tensor.
    """
    step = tf.cast(step, dtype=tf.float32)
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

### Define the Transformer Model 

In [268]:
class Transformer(tf.keras.Model): 

    def __init__(self, len_emb, dim_kv, dim_q, heads, d_model,
                dd_model, iterEnc, iterDec, df_model, len_seq_out,
                drop_rate = 0.1, epsilon = 1e-6):
        
        super(Transformer, self).__init__()
        self.len_emb = len_emb
        self.len_seq_out = len_seq_out
        
        self.encoder = Encoder(dim_kv, dim_q, heads, d_model, len_emb, len_human_vocab, iterEnc, drop_rate = 0.1) #shape = (#samples, enc_len_seq, len_emb) 
        self.decoder = Decoder(len_emb, dim_kv, dim_q, heads, dd_model, iterDec, len_seq_out, drop_rate = 0.1, epsilon = 1e-6) #(#samples, dec_len_seq, len_emb) 
        
        self.dense = Dense(units = len_machine_vocab, activation = 'softmax')
        self.learning_rate = CustomSchedule(d_model)
        self.optimizer = tf.keras.optimizers.Adam(self.learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)
        
    def call (self, input_enc, input_dec, enc_pad_mask = True, training = True): 
        
        """
        this is the forward pass of the model 
        """
        
        num_samples = input_enc.shape[0]

        # Run through the Encoder
        enc_output = self.encoder(input_enc, training = training, enc_mask = enc_pad_mask)
        # Run through the Decoder 
        dec_outputs = self.decoder(input_dec, enc_output, training = training) #shape = (#samples,len_seq_out, len_emb]
        dec_outputs = self.dense(dec_outputs)  # shape = (#samples, len_seq_out , len_machine_vocab)
        return(dec_outputs)

    def predict(self,input_enc, enc_pad_mask = True, training = False): 
        
        # Initialize the input of the decoder 
        input_dec = init_decoder(input_enc.shape[0])
        
        for i in range(self.len_seq_out): 
            #print(f"iteration {i}")
            dec_output = self.call(input_enc, input_dec, training = False)
            #print(f"this is the shape of decoder output before slicing: {dec_output.shape}")
                
            # picking only the corresponding prediction for ith letter
            dec_output = dec_output[:,-1,:] 
            arg = tf.cast((tf.argmax(dec_output,axis = -1)),dtype = tf.float32)
                
            # Update the decoder output 
            input_dec = tf.concat([input_dec, arg[:, np.newaxis]], axis = -1)
            #print(f"input_dec after {i}th iteration : {input_dec}")
        
        return(input_dec)
        
    def evaluate(self, input_enc, expected_output): 
        predictions = self.predict(input_enc, enc_pad_mask = True, training = False)
        
        # Calculate accuracy
        accuracy = tf.reduce_mean(tf.cast(tf.equal(predictions, expected_output), tf.float32)) # finds the mean accuracy. 
        return(accuracy) 
        
    
    def train_step(self, input_enc, input_dec, y_trainmod, training=True, enc_pad_mask = True):
        with tf.GradientTape() as tape:
            # Forward pass
            predictions = self.call(input_enc, input_dec, enc_pad_mask = True, training=True)
            # Compute the loss
            loss = tf.keras.losses.sparse_categorical_crossentropy(y_trainmod, predictions, from_logits=True)
            # Take average across all samples in the batch 
            loss = tf.reduce_mean(loss)
        
        # Compute gradients
        gradients = tape.gradient(loss, self.trainable_variables)

        # Apply gradients to update weights
        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
        
        # Calculate accuracy
        predicted_classes = tf.argmax(predictions, axis=-1)
        accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted_classes, y_trainmod), tf.float32)) # finds the mean accuracy. 
        return loss, accuracy

        
    def fit_model(self, input_enc, input_dec, y_trainmod, epochs=10, batch_size=200):
        
        for epoch in range(epochs):
            total_loss = 0
            total_accuracy = 0
            num_batches = len(input_enc) // batch_size
            
            for i in range(0, len(input_enc), batch_size):
                batch_input_enc = input_enc[i:i+batch_size]
                batch_input_dec = input_dec[i:i+batch_size]
                batch_y_trainmod = y_trainmod[i:i+batch_size]
    
                loss, accuracy = self.train_step(batch_input_enc, batch_input_dec, batch_y_trainmod)
                total_loss += loss
                total_accuracy += accuracy
        
            avg_loss = total_loss / num_batches
            avg_accuracy = total_accuracy / num_batches
            print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss.numpy()}, Accuracy: {avg_accuracy.numpy()}")   


In [273]:
len_emb= 30 
dim_kv= 30
dim_q= 30
heads= 3
d_model=30
dd_model= 30
iterEnc= 6 
iterDec= 6
df_model= 30
len_seq_out= 11
drop_rate = 0.1
epsilon = 1e-6
num_samples = X_trainmod.shape[0]
model = Transformer(len_emb, dim_kv, dim_q, heads, d_model,
                dd_model, iterEnc, iterDec, df_model, len_seq_out,
                drop_rate = 0.1, epsilon = 1e-6)

This is the first block of code. if training ==True: Note that the model returns values for all time steps except for the <sos> token (T=11)

In [274]:
import warnings
# Suppress all warnings
warnings.filterwarnings("ignore")
# Call the transformer 
res = model(X_trainmod, y_train_dec_input, training = True, enc_pad_mask= False) 
res.shape 

TensorShape([7500, 11, 13])

In [275]:
model.predict(X_trainmod[0:10])

<tf.Tensor: shape=(10, 12), dtype=float32, numpy=
array([[11.,  9.,  9.,  9.,  9.,  9.,  1.,  6.,  6.,  6.,  6.,  6.],
       [11.,  9.,  9.,  9.,  9.,  9.,  1.,  6.,  6.,  6.,  6.,  6.],
       [11.,  9.,  9.,  9.,  9.,  9.,  1.,  6.,  6.,  6.,  6.,  6.],
       [11.,  9.,  9.,  9.,  9.,  9.,  6.,  6.,  6.,  6.,  6.,  6.],
       [11.,  9.,  9.,  9.,  9.,  9.,  6.,  6.,  6.,  6.,  6.,  6.],
       [11.,  9.,  9.,  9.,  9.,  9.,  1.,  6.,  6.,  6.,  6.,  6.],
       [11.,  9.,  9.,  9.,  9.,  9.,  6.,  6.,  6.,  6.,  6.,  6.],
       [11.,  9.,  9.,  9.,  9.,  9.,  6.,  6.,  6.,  6.,  6.,  6.],
       [11.,  9.,  9.,  9.,  9.,  9.,  1.,  6.,  6.,  6.,  6.,  6.],
       [11.,  9.,  9.,  9.,  9.,  9.,  1.,  6.,  6.,  6.,  6.,  6.]],
      dtype=float32)>

Also note that the model doesn't make 12 predictions but 11.

In [278]:
model.fit_model(X_trainmod, y_train_dec_input, y_trainmod_dec_output, epochs = 10) # around 170 iterations 130 already done 

Epoch 1/10, Loss: 0.1761823147535324, Accuracy: 0.9657984375953674
Epoch 2/10, Loss: 0.17642322182655334, Accuracy: 0.9660934209823608
Epoch 3/10, Loss: 0.1827087700366974, Accuracy: 0.9630221724510193
Epoch 4/10, Loss: 0.15859493613243103, Accuracy: 0.9724570512771606
Epoch 5/10, Loss: 0.1602710336446762, Accuracy: 0.9702333807945251
Epoch 6/10, Loss: 0.15363916754722595, Accuracy: 0.9731572270393372
Epoch 7/10, Loss: 0.16199634969234467, Accuracy: 0.9702948331832886
Epoch 8/10, Loss: 0.16432568430900574, Accuracy: 0.9698035717010498
Epoch 9/10, Loss: 0.1478009819984436, Accuracy: 0.9749385714530945
Epoch 10/10, Loss: 0.16093496978282928, Accuracy: 0.9705649614334106


In [279]:
predictions = model.predict(X_testmod[0:10], init_decoder(10), training = False).numpy()
predictions

array([[11.,  3.,  1.,  1.,  8.,  0.,  1.,  4.,  0.,  1.,  8., 12.],
       [11.,  2., 10., 10., 10.,  0.,  1., 10.,  0.,  1.,  7., 12.],
       [11.,  2., 10., 10., 10.,  0.,  2.,  1.,  0.,  1.,  6., 12.],
       [11.,  2., 10.,  8.,  1.,  0.,  2.,  3.,  0.,  1.,  7., 12.],
       [11.,  3.,  1.,  1.,  3.,  0.,  1.,  3.,  0.,  1.,  8., 12.],
       [11.,  2., 10.,  8.,  1.,  0.,  1.,  9.,  0.,  1.,  8., 12.],
       [11.,  2., 10.,  9.,  7.,  0.,  2.,  1.,  0.,  4.,  2., 12.],
       [11.,  3.,  1.,  1.,  6.,  0.,  2.,  3.,  0.,  4.,  1., 12.],
       [11.,  2., 10.,  8.,  4.,  0.,  2.,  1.,  0.,  1.,  3., 12.],
       [11.,  2., 10.,  8.,  8.,  0.,  2.,  3.,  0.,  3.,  7., 12.]],
      dtype=float32)

In [280]:
#predictions = predictions.numpy()
for j in range(10):#predictions.shape[0]):
    x = []
    for i in range(12): 
        x = x + reverse_lookup([predictions[j][i]])
        result = ''.join(x)
    print(f"predictions: {result},\n Expected output {X_test[j]}\n")


predictions: <sos>2007-03-07<end>,
 Expected output 7 mar 2007

predictions: <sos>1999-09-06<end>,
 Expected output 06.03.99

predictions: <sos>1999-10-05<end>,
 Expected output tuesday october 5 1999

predictions: <sos>1970-12-06<end>,
 Expected output 6 december 1970

predictions: <sos>2002-02-07<end>,
 Expected output 07 feb 2020

predictions: <sos>1970-08-07<end>,
 Expected output friday august 7 1970

predictions: <sos>1986-10-31<end>,
 Expected output friday october 31 1986

predictions: <sos>2005-12-30<end>,
 Expected output 30 december 2005

predictions: <sos>1973-10-02<end>,
 Expected output tuesday october 2 1973

predictions: <sos>1977-12-26<end>,
 Expected output 26 dec 1977



In [282]:
model.evaluate(X_testmod,y_testmod)

<tf.Tensor: shape=(), dtype=float32, numpy=0.95683336>

#### Some intuition 

Here is the situation we are dealing with: 
the call method does predict both the testing and training datasets accurately but this is only when the y_trainmod or testmod is given to it. if we give it the y_input that is just the start token, the loss seems to be infinitiy. but generally speaking, during the training process, we must be able to give the y_trainmod to it and this should make no difference in the output prediction . 

so check if the look-ahead y_trianmod 

transform the training loop such that it'll replace the tensor with the predicted value. but in the testing we have a loop and in each loop we might have to actually pick the last prediction because the transfomer creates a 12,13 predictions and you'd pick the last one to predict. but why? if the transformer model predicts the words for all and you only need say the second word, then you take only the second word and run it again. let it make predictions for all but you only need the corresponding prediction. but what you are doing in the decoder tensting section is that the model still makes predictions for all of the len_seq_out (12) but you then run it all through another dense layer to collapse it. if the transformer's model is created correctly, it must make predictions for all of the time sequences (12). but remind me again, why do we take only the last prediction? because technically this must be the end sequence to be predicted why do we take the last prediction? 
the transformer should only make predictions for the next word not all the sentences. in that case you only take the last time-step predcition and this referes to the lastest word predicted. 

I know one approach. that we remove the layer that collapses all the 12 time -step predictions into 1. 

so our model does well both for the testing and training datasets in the training mode. when it is predicting all the words at the same time. are we predicting all the words at the same time? yes. but in order to make the accurate predictions oh the look-ahead mask. the look ahead mask what does it say? not to look at the words that are considered not given. right. but your input is a set of integers. these ingegers are then given to the model. so the look-ahead mask must know which algorithms to hide? no each integer in the vector integer input will be mapped to a row. so then the model needs to know which rows to completely ignore. so essentailly the input of the decoder is say 12,20: 
in the sake of training, the input of the decoder is vector (12,) containing the correct output. then this vector is mapped to (12,20); so to each integer index is mapped to a row of integers. now the model looks at all of them with respect to another to learn if there are any intera dependencies between them. but the input has all the correct values. to produce the correct output the model simultanously predicts all the words. but to predict the first word, the model must not see what the correct input is for the first word. so for the first prediction, the model will only see the start token. which is 11. then 11 must be padded to a vector of [11, 0*11]; then this vector is mapped to an embedding so now we have a matrix of 12x20 notice that the model must only pay attention to the first row and no attention to the rest of the words. in the attention mechanism, the attention scores will be of dimension 12x12 implying how much attention must be given to every word in the sequence. so in the first time step prediction, the model must not pay attention to any letters except the first one. so we'll need an attention mask such that when added to the model, it will not pay any attention to any index larger than 0 index. so the attention mask that must be given to the model is 


but if the decoder is making predictions for all of the time steps simultanously, then for the first time step we only take the start token then int 11 will be mapped to (12,20). my question is how would you parallely make predictions for all time steps? 
we have created a look-ahead mask that for each row, it'll say what words to focus on. but wouldn't each time step has its own attention matrix? yes. ok then the first attention matrix will be added to a mask of [0 ,1 ,1 ,1,...1] which forces all rows of the attention scores to only pay attention to the first word. but the problem is that all those words are being mapped. so we literally have a 12, complete vector given as the input of the decoder. so then we calculate the attention scores which implies how much attention must be given to each word. so we're saying when predicting the first word, only pay attentino to the start token and nothing else. but when it comes to the second row, meaning second time-step, only pay attention to the first two tokens. but the thing is that each one of them is an embedding no? no they will all be attention scores. so the second row would entail the attention scores for the rest of tokens to become 0. 

then moving to the next attention mechanism, the first row will be mapped to the prediction based on the enc output and the starting token right? yes. so then it'll be one row of len_emb this is the output of the attention 1. and then this will be mapped to a query which means that only the columns will differ. but the number of rows will be the same only 1. then the key is produced from the enc_output and has dim (27,20) say you map to 20 columns so then a 1x20 is multiplied by a 20x27 we would get a 1x27 row of attention scores we then want to multiply this with its corresponding row 1x20 dimensions won't match. now the thing is that we will make prediction for all of the time steps. doesn't make sense. 



let's start again; 
we have the start token fed into the decoder; then the decoder input will run through an embedding layer which is (1,512). then into the MHA1 where it is mapped to query, key and value matrices. the query and key both have dimension 1x20 so when multiplying, we will get a scalar and then once run through the softmax it'll give us the attention score of 1 with dim 1x1 which is then multipleid to the value dim 1x20 to give us 1x20 vector. since we only have one token, the attention score simply represents how similar the start token is to itself.

Note that if we gave the model 2 words instead of 1, then those 2 words must be mapped to 2x512 embeddings layer. then the query and key then dot produdct the attention scores will be of dimension 2x2. if 3 words given then attention scores are 3x3. 


note that the dimension is different from what you are training the model to have. in our model you pad the sequence to be equal in length to the complete input. 


also note that the look-ahead mask is only given when the model is in training phase. that is when all the words are simultanously processes but in truth we must run it row by row. 

once into the MHA 2, we have a row vector of 512 dimensions. and this will be mapped to a query 1x512. then we have the enc_output that has 3x512 in it. the dot produce will give us a 1x3 vector saying how similar the start token is to each row of the enc_output. 

if we have 2 words in the input-dec, then we would have 2x512 as the query and the key would be a 3x512. then result of the dot product will be a matrix of 2x3. each row will show the attention scores or the measurement of the dependencies of that word with each word in the encoder. 

so taking our 1x3 attention scores, we multiply it with the value which is 3x512. we end up with a 1x512 vector. this vector is then used to make predictions. how to make predictions and with this one given row how could we make predictions on all of the time steps? we don't. we only produce of row of probabilities predicting the next word. 

what if program the transformer such that the last prediction is the for the last time step along the 12 steps? 

in the cross-attention, imagine we have the y_input as [11,0,0,...,0]; in that case, when we are calculating the dependencies of this input sentence with the encoder output, we want to only make attention scores for the first word and all the encoder output i.e. we want the first word attention scores with all 27 other words. but that's it. no other attention must be made. 

you can modify your code such that if the input given is none, then the model generates it's own values. 

analyze how the input of the decoder with its dimensions merges with the input coming from the encoder. 

Come back to the imported libraries and clean. 

one thought is that the model learns to predict the start token when the start token is given to it. so then the start token is appended to the result and once the start token is given the model again predicts the start token. so what if we give it another input? 