The objective of this project is to: 
* Create positional encodings to capture sequential relationships in data
* Calculate scaled dot-product self-attention with word embeddings
* Implement masked multi-head attention
* Build and train a Transformer model

In [1]:
# Loading the required packages: 
import tensorflow as tf
import time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tensorflow.keras.layers import Embedding, MultiHeadAttention, Dense, Input, Dropout, LayerNormalization, Layer
from tensorflow.keras.models import Sequential
from tensorflow import  reshape, shape, transpose

from transformers import DistilBertTokenizerFast #, TFDistilBertModel
from transformers import TFDistilBertForTokenClassification


from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.stem import PorterStemmer,LancasterStemmer
import re

from sklearn.model_selection import train_test_split 

We want the Softmax function that assigns the attention scores to avoid assigning any attention score to the padded parts of the sequence. So, instead we can either define a function that replaces vectors of all zeros with negative infinity (-1e-9) or when creating the padded embeddings for each input, we can assign -1e-9 to every padded token. But if we add the padding before going through the dot product attention (before the softmax), it is possible that through multiplication with matrices q,k, and v the padded vectors grow larger and then when we run the resultant matrix through softmax, it might again not assign 0 attention scores to the padded sequences. Therefore, the padded mask must be added after the dot product. Then apply Softmax, then multiply with the V matrix. Where to normalize? we will normalize the attention scores after the dot product before masking is applied. 

In [2]:
#Loading the data: 
CustomerFeed = 'Canva_reviews.xlsx'
df = pd.read_excel(CustomerFeed)

print(df)

                                               reviewId            userName  \
0     gp:AOqpTOFxf3fttcT5DSvFIn9KPp5FErgH9yC533Fmoxv...      Donna Caritero   
1     gp:AOqpTOEq6rNIWLnPV4KFTctWvm0mpGEQljtD6mvy1H-...  Soumi Mukhopadhyay   
2     gp:AOqpTOE86hSyPRHZgYt28Uk5zGe4FZGb1hkmtFDiYJ2...   Theknown _unknown   
3     gp:AOqpTOHSuKkVTcM3QgCCKysHQlxEnk2ocOKsUMiMIJy...        Anthony Dean   
4     gp:AOqpTOEOrZt5H6jXPiplJyffCd5ZBnVXACTWgwNsF1R...   Neha Diana Wesley   
...                                                 ...                 ...   
1495  gp:AOqpTOHhnXMpylU3f-1V1KbR2hwWArOilxPlKI6K4xY...            Reen Ali   
1496  gp:AOqpTOEcz62DHS-amqTB5xGMhM4_R0UJpcv_HDNny9i...     Shaurya Chilwal   
1497  gp:AOqpTOFMqEqa_kpp29Q8wjcBmKUCAvOQGQx4KZQ8b83...           GK Gaming   
1498  gp:AOqpTOGY4z3pUxeiqGzn2ad3Noxqlbm-9DZ3ksHqD1_...    1203_Vani Sharma   
1499  gp:AOqpTOFVGZ0MXyR-Gv_d2cYf2KD709Hwple_u7OZE4y...           MeLLy EcK   

                                              userI

In [3]:
df = df[["review", "Sentiment"]]
df.head()

Unnamed: 0,review,Sentiment
0,Overall it's really an amazing app. I've been ...,Negative
1,Hey! Yes I gave a 5 star rating... coz I belie...,Positive
2,Canva used to be a good app! But recently I've...,Negative
3,"It's a brilliant app, but I have just one prob...",Negative
4,This was such a great app. I used to make BTS ...,Negative


In [4]:
def edit_txt(review):
    """
    This function receives a text and returns it edited as follows: 
    1, all words converted to lower case 
    2, integers removed
    3, tokenize the words 
    4, punctuation removed 
    5, common words that are unnecessary are removed. 
    """
    
    review_edited = []

    #Converting to lower case: 
    review_edited = review.lower() 
    
    #Removing integers: 
    pattern = r'[0-9]'
    # Match all digits in the string and replace them with an empty string
    review_edited = re.sub(pattern, '', review_edited) 

    #Tokenize the comment: 
    review_edited = word_tokenize(review_edited) 

    #Removing punctuation 
    tokenizer = RegexpTokenizer(r'\w+')
    review_edited = [''.join(tokenizer.tokenize(word)) for word in review_edited if len(tokenizer.tokenize(word))>0]

    #Removing common words: 
    remove_list = stopwords.words('english') 
    to_remove = [ "not",'don',"don't",'should',"should've", 'ain','aren',"aren't",'couldn',"couldn't",'didn',"didn't",'doesn',"doesn't",'hadn',"hadn't",'hasn',"hasn't",'haven',"haven't",'isn',"isn't",'mightn',"mightn't",'mustn',"mustn't",'needn',"needn't",'shan',"shan't",'shouldn',"shouldn't",'wasn',"wasn't",'weren',"weren't",'won',"won't",'wouldn', "wouldn't"]
 
    review_edited = [word for word in review_edited if not word in remove_list]
    return(review_edited) 


In [5]:
# Extract the reviews: 
x = df["review"] 

#Modify the text to test the function reviews_edited: 
reviews_edited = [edit_txt(review) for review in x]
print(reviews_edited[13])
print(x[13])

# Define the target dataset and extract the unique rankings: 
y = df["Sentiment"].tolist()
ranking = np.unique(y)
ranking = ranking.tolist()
ranking

['unable', 'save', 'work', 'nothing', 'works']
Unable to save my work. Nothing works :(


['Negative', 'Positive']

### <font color = "red"> Do we need the following? </font>

In [6]:
# Creating the dictionary: 
Split = [] 
Dic = []
dictionary = np.unique([word for review in reviews_edited for word in review]).tolist()

# Add extra padding to limit the length of the input: 
dictionary = dictionary + ["<pad>"]
dictionary[1:10]

['aa',
 'aap',
 'ability',
 'able',
 'absolutely',
 'acc',
 'accepted',
 'access',
 'accessibilities']

In [7]:
# Split the dataset into training and testing datasets: 
#x = x.to_list()
X_train, X_test, y_train, y_test = train_test_split(x,y, 
                                   random_state=104,  
                                   test_size=0.25,  
                                   shuffle=True) 

In [8]:
# Apply the edit_txt function to both text corpus: 
X_train = [edit_txt(comment) for comment in X_train]
X_test = [edit_txt(comment) for comment in X_test]

In [9]:
# Load the word embeddings (Glove word embeddings) 
embeddings_dict = {}
with open("glove.6B.50d.txt", 'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

words =  list(embeddings_dict.keys())
vectors = [embeddings_dict[word] for word in words]

In [10]:
# Encoding the input with Glove Word Embeddings: 
def gvec_input(x,m,e): 
    """
    
    This function takes any input, x, and returns a glove vector based on the 
    words introduced in the vocabulary (400,000 words). This function returns k vectors where k is the number of words in the 
    sentence. Every vector corresponds to a word in the dictionary and each entry will describe a feature of the word. 
    
    inputs: 
    
    x (string) : a statement from customers. 
    m (int)    : size of the sequence 
    e (int)    : size of the embeddings 
    outputs: 
    v (m,n)    : where m is the number of words in the sentence and n = 50 is the number of total features describing a word. 

    
    """
    n = len(x)
    gv = np.zeros((n,m, e))
    
    for i in range(0, n): #looping over each comment 
        txt = x[i] #select the ith comment  
        txt = (txt[:m] if len(txt) > m else txt + ['<pad>'] * (m - len(txt))) #shorten or add extra padding
        for l in range(m): #looping over each word 
            
            # add the embedding of all ones for pads
            if txt[l] == "<pad>": 
                gv[i,l,:] = np.ones(e) 
                
            # if a word is not is the list of Glove embeddings, then assign an array which is the average of all embeddings:    
            elif txt[l] not in words: 
                gv[i,l,:] = np.mean(vectors, axis = 0)
            # add the word embeddings: 
            else: 
                gv[i,l,:] = embeddings_dict[txt[l]]
    return(gv)

In [11]:
# Limit the length of the sequence: 
m = 30 
# The length of the embeddings: 
e = 50
X_trainmod = gvec_input(X_train,m,e) 
X_testmod = gvec_input(X_test,m,e)

In [12]:
print(X_trainmod[0])
X_trainmod.shape

[[ 0.79238999  0.21864     0.68711001 ... -0.066753   -0.39660001
   0.74818999]
 [ 0.36998999  0.082841    0.16883001 ...  0.0053184  -0.50853002
   0.24986   ]
 [ 0.02648     0.33737001  0.065667   ... -0.3398     -0.23043001
   0.19069   ]
 ...
 [ 1.          1.          1.         ...  1.          1.
   1.        ]
 [ 1.          1.          1.         ...  1.          1.
   1.        ]
 [ 1.          1.          1.         ...  1.          1.
   1.        ]]


(1125, 30, 50)

In [22]:
# Calculate the angles for positional embeddings: 
def get_angles(pos, k, d):
    """
    Get the angles for the positional encoding
    
    Arguments:
        pos -- Column vector containing the positions [[0], [1], ...,[N-1]]
        k --   Row vector containing the dimension span [[0, 1, 2, ..., d-1]]
        d(integer) -- Encoding size
    
    Returns:
        angles -- (pos, d) numpy array 
    """
    
    # Get i from dimension span k
    i = k//2
    # Calculate the angles using pos, i and d
    angles = pos/ (10000)**(2*i/d)

    
    return angles
    
def pos_emb(len_seq,len_emb): 
    
    """
    This function creates the positional embeddings for all the words in the sequence based on: 
    
    Input: 
    len_seq (int) : The length of the sequences inputed into the model. 
    len_emb (int) : The length of the word embeddings for every word in the sequence. 

    Note: the size of the positional encoding and the word embeddings must match in order to add them in the next step. 

    Output: 
    res (np.array(len_seq, len_emb)) : ith row of this matrix represents the positional encodings for the ith position in the sequence. 

    """

    len_i = int(len_emb/2)

    # Initialize the matrix to save positional encodings: 
    res = np.zeros((len_seq,len_emb))
    angles = np.zeros((len_seq,len_emb))
    
    #for each position in the sequence 
    for pos in range(len_seq): 
        
        #calculate the angles: 
        for i in range(len_i): 
            angles[pos,2*i] = pos/(10000**(2*i/len_emb))
            angles[pos, 2*i +1] = pos/(10000**(2*i/len_emb)) 
        
        # Calculate the entries corresponding to each position 
        #for j in range(len_i): 
        res[pos, 0::2] = np.sin(angles[pos,0::2])
        res[pos,1::2] = np.cos(angles[pos,0::2])
            
    return(tf.cast(res.reshape(1,len_seq,len_emb), dtype=tf.float32))


In [23]:
# Create the positional embeddings: 
position_enc = pos_emb(X_trainmod.shape[1],X_trainmod.shape[2])
position_enc.shape

TensorShape([1, 30, 50])

In [24]:
# Add the positional encoding to the word embeddings: 
X_trainmod = X_trainmod + position_enc 
print(X_trainmod.shape)

X_testmod = X_testmod + position_enc 
X_testmod.shape

(1125, 30, 50)


TensorShape([375, 30, 50])

In [25]:
def padding_mask(tensor): 
    """
    this function will encode the padded sequences as -1e-9 so that when they are run through a Softmax, the value will be equal to zero. 
    """
    # Identify rows where all elements are zero
    is_zero_row = tf.reduce_all(tf.equal(tensor, 0), axis=1)

    # Expand is_zero_row to match the shape of tensor
    is_zero_row_expanded = tf.expand_dims(is_zero_row, axis=-1)

    # Replace zeros with -1e-9 where the row is all zeros
    result_tensor = tf.where(is_zero_row_expanded, tf.constant(-1e-9, dtype=tf.float64), tensor)
    return(result_tensor)


In [None]:
#now time to define the self_attention: 

In [26]:
def self_attention(q,k,v, masking):
    """
    this function calculates a self_attention mechanism 
    res are the final attention scores. 
    """
    
    
    # Perform matrix multiplication on the last two dimensions
    dotqk = tf.matmul(q, k, transpose_b = True)

    dim_k = tf.cast(k.shape[-1],tf.float32)
    normalized_dotqk = dotqk/tf.math.sqrt(dim_k)
    
    #then add the masking if masking if given" 
    if masking is not None: 
        normalized_dotqk += (1 - masking)* (-1e9)
    
    attention_scores =  tf.nn.softmax(tf.cast(normalized_dotqk, dtype=tf.float32),axis = -1)
    res = tf.matmul(attention_scores,v) 
    
    return(res)
    

### <font color="red"> Review masked functions</font>

Preferably, we want the input of the Encoder structure to already have the word embeddings and the positional encodings. In the Encoder structure, we will have the multi-head attention (think of it as running the self-attention multiple times) and a fully connected neural network which will be called FullFeedForward. 

In [27]:
def FullFeedForward(n_1, emb_size):#the model must return vectors of the same size as the embeddings of the input so can be combined with decoder
    model = Sequential([
    Dense(n_1, activation='tanh', name="dense1"), #relu? (#samples, len_seq, n_1)
    Dense(emb_size, activation='tanh', name="dense2")# linear? (#samples, len_seq, emb_size)
])
    return(model)
    

# Questions
Why is the embedding size also taken as an argument in MHA? we get matrices q, k, and v. The product of qTk will give a dim_k or dim_q by emb_size. The final product in the attention mechanism must yield a matrix of the same length of seq and emb_size. 

* look into the command of MHA.
* LayerNormalization.

### Multi-head attention? 
We will input 3 xs (possibly they could be different?) then the inputs are mapped linearly to give us the matrices Query, Key and Value. 
* dimension x (#batches, len_seq, len_emb)
* dim of k:$K^T x$ if k is (len_seq,dim_k), then its transpose is (dim_k, len_seq), the resultant matrix is going to have dim (dim_k, len_emb)
* dim of q: $Q^T x $; if q is (len_seq,dim_q), then its transpose is of dim (dim_q, len_emb) and the resultant dot product gives (dim_q,len_emb)
* Similarly, for the multiplication of $V^T x$, we have the value being of dimension (dim_v, len_emb).
  * if it is a self-attention (attention with only one head), then $qk^T$ has dim (dim_q, dim_k), scale, add the mask and dropout if given.
  * if it has n heads, then we will produce query and key matrices of dimensions dim_q/n, dim_k/n. After the dot product, the result is of dim (dim_q/n, dim_k/n). We then concatenate these results to get the desired dim of (dim_q,dim_k). $ \bold{make sure you understand the concatenation} $
* dot prodcut v (dim_v, len_emb) qTk (dim_q, dim_k) --> $ qTk .v $ Note that here dim_k must be the same as the dimension of v for this dot product to occur.
* just like magic, you have the attention scores now and the result is a matrix of (dim_k, len_emb).
* so then we add our initial x and normalize too. in order to add x to the attention scores, the attention scores need to have the same dim as x. meaning that dim_k needs to be the same as the len of the sequence.

### Fully Connected Neural Network: 

We feed the matrix out of the attention mechanism into the fully connected neural network. how many neurons? what matters is that the output layer must have len_emb neurons in order to match the dim of x. why do we need them to match? becoz we again add the input seq x to the result (after another layer of normalization). 

Then copy the result, pass as key and value to the decoder network. 

# Question isn't the dot product we are talking here actually a cross product?!

In [41]:
def reshape_tensor(q_matrix, heads, pre_attention): 
    
    #pre_attention, we'll need to reform into 4d 
    if pre_attention:

        dense_qre = reshape(q_matrix, (shape(q_matrix)[0], shape(q_matrix)[1], heads, -1))
        dense_qre = transpose(dense_qre, ([0, 2, 3, 1]))
        
        
    #post_attention, we'll need to revert back to 3d: 
    else: 
        q_matrix_transpose = transpose(q_matrix, ([0,3,1,2]))
        dense_qre = reshape(q_matrix_transpose, (shape(q_matrix_transpose)[0], shape(q_matrix_transpose)[1], -1)) 
        
        
    return(dense_qre)
        

Do you wanna define another function that takes the dims you'd like and deliver you the query, key and value matrices? 
because now we no longer need to have as inputs, the dim_kv and dim_q. would we need the masking? yes in self_attention. 
we need the mha to take 3 arguments as q,k,v. 

from tensorflow.keras.layers import Layer

class MultiHeadAttentionM(Layer):  # Ensure this name matches in `super()`
    def __init__(self, len_emb, heads, masking, query_mat, 
                 key_mat, value_mat, **kwargs):
        
        super(MultiHeadAttention, self).__init__(**kwargs)  # Ensure the class name here is correct
        self.heads = heads
        self.masking = masking
        self.d_model = len_emb
        self.mha = self_attention()
        self.query = query_mat 
        self.key = key_mat
        elf.value = value_mat 

    
    def call(self,x,**kwargs): #by passing self, you passed all the attributes you've defined above. 
     
        # Reshape: 
        dense_qre = reshape_tensor(self.query, self.heads, pre_attention = True) #shape = (#samples, #heads, dim_q/heads, len_seq)
        dense_kre = reshape_tensor(self.key, self.heads, pre_attention = True) #shape = (#samples, #heads, dim_k/heads, len_seq)
        dense_vre = reshape_tensor(self.value, self.heads, pre_attention = True) #shape = (#samples, #heads, dim_v/heads, len_seq) 
        
        # Calculate the attention scores: 
        attention_scores = self.mha(dense_qre, dense_kre,dense_vre, masking) #shape = (#samples, #heads, dim_q/heads, len_seq)
        
        # Revert the shape:
        attention_with_v = reshape_tensor(attention_scores, self.heads, pre_attention = False) #shape = (#samples, len_seq, dim_q)
        
        # Run through another dense and add to the initial x: 
        res = Dense(units = self.d_model)(attention_with_v)  # shape = (#samples, len_seq, d_model) 
        #how to add the dropout and the normalization layers? 
        return(res)


In [42]:
from tensorflow.keras.layers import Layer

class MultiHeadAttention(Layer):  # Ensure this name matches in `super()`
    def __init__(self, dim_kv, dim_q, len_emb, heads, **kwargs):
        
        super(MultiHeadAttention, self).__init__(**kwargs)  # Ensure the class name here is correct
        self.dim_k = self.dim_v = dim_kv
        self.dim_q = dim_q
        self.heads = heads
        self.d_model = len_emb

    
    def call(self,q,k,v, masking = None, **kwargs): #by passing self, you passed all the attributes you've defined above. 
       
        # Define the query, key, and value matrices: 
        dense_q = Dense(units = self.dim_q)(q) # shape = (#samples, len_seq, dim_q)
        dense_k = Dense(units = self.dim_k)(k) # shape = (#samples, len_seq, dim_k) 
        dense_v = Dense(units = self.dim_v)(v) # shape = (#samples, len_seq, dim_v) 
        
        # Reshape: 
        dense_qre = reshape_tensor(dense_q, self.heads, pre_attention = True) #shape = (#samples, #heads, dim_q/heads, len_seq)
        dense_kre = reshape_tensor(dense_k, self.heads, pre_attention = True) #shape = (#samples, #heads, dim_k/heads, len_seq)
        dense_vre = reshape_tensor(dense_v, self.heads, pre_attention = True) #shape = (#samples, #heads, dim_v/heads, len_seq) 
        
        # Calculate the attention scores: 
        attention_scores = self_attention(dense_qre, dense_kre,dense_vre, masking) #shape = (#samples, #heads, dim_q/heads, len_seq)
        
        # Revert the shape:
        attention_with_v = reshape_tensor(attention_scores, self.heads, pre_attention = False) #shape = (#samples, len_seq, dim_q)
        
        # Run through another dense and add to the initial x: 
        res = Dense(units = self.d_model)(attention_with_v)  # shape = (#samples, len_seq, d_model) 
        
        return(res)


how to add the dropout and the normalization layers? 

In [44]:
dim_kv = 30 
dim_q = 20 
len_emb = 50
heads = 2 

masking = None

function = MultiHeadAttention(dim_kv, dim_q, len_emb, heads)
function(X_trainmod, X_trainmod,X_trainmod, masking = None).shape

TensorShape([1125, 30, 50])

* How do we initialize the q, k, and v matrices?

    A multi-head attention class is defined where based on the training x, created the q,k, and v matrices by applying a dense layer to the input sequence each time. 


* How is this model trained?
  Still a question.

* For the encoder layer, what attributes do we need?
   * Better question to ask is what do we want the Encoder layer do?
     When running the encoder layer, we want to input the input sequence; then this input sequence will go through to add word embeddings, then positional encodings. We then run the attention model on this to get the attention scores added to the structure. we then normalize and add dropout. Then run through a fully connected neural network, add x, normalize and add another dropout layer.

* What is the purpose of the Dropout function and what are its arguments?
 
  let's assume the dropout rate is 0.1. During training, the dropout layer randomly selects 10% of the input and replace it with zeros. This prevents the model to overfit the parameters based on the training set and also prevents the model to become too reliant on certain parameters. During the call function, make sure you set the training argument to 'True' so that the model will apply dropout only during training and does nothing during the inference mode (making predictions). 

* As an alternative to defining our own Multi-Head attention, we could use the one built-in Tensorflow package. Check out if the calculations are all the same and what the arguments to this layer are. 

In [72]:
class Encoder(Layer):
    
    def __init__(self, dim_kv, dim_q, heads, fnn_neurons, len_emb, drop_rate, iter):
        
        super(Encoder,self).__init__()
        self.mha     = MultiHeadAttention(dim_kv, dim_q, len_emb, heads)
        self.norm    = LayerNormalization(epsilon = 1e-6)
        self.drop    = Dropout(rate = drop_rate)
        self.fnn     = FullFeedForward(fnn_neurons, len_emb)
        self.iter    = iter

        
    def call(self,x,training, masking): 
        
        
        for _ in range(self.iter): 

            # Add dropout layer: 
            drop_x = self.drop(x, training = training) 
            
            # Calculate the attention scores: 
            mha_scores = self.mha(drop_x, drop_x, drop_x, masking = masking)
        
            # Add dropout and normalize: 
            dropout_1 = self.drop(mha_scores, training = training)
            norm_1  = self.norm(dropout_1 + x )
        
            #Run through a fully connected neural network: 
            fnn_output = self.fnn(norm_1) 
            
            # Add dropout: 
            dropout_2 = self.drop(fnn_output, training = training)
        
            # Normalize: 
            x = self.norm(dropout_2 + norm_1)
            
        return x
            
        

In [70]:
dim_kv = 30 
dim_q = 20 
len_emb = 50
heads = 2 
masking = None 
fnn_neurons = 20
drop_rate = 0.1
function = Encoder(dim_kv, dim_q, heads, fnn_neurons, len_emb, drop_rate,10)
output_encoder = function(X_trainmod, masking = None)
output_encoder.shape

TensorShape([1125, 30, 50])

the next task is to have an encoder layer. you then have a decoder and then the transformer. to the transformer, we would like to only input the x and not modify to add embeddings or positional embeddings. but for the encoder part, we would like to repeat the encoder part multiple times. so essentially, we want to add a loop to the encoder section. how to do that? 
what is going to be on repeat? the full encoder layer.
so what would be the input to the encoder? x 
at first, the x will be the training set but for the next iterations on the loop, we will take the output of the encoder and input for the next time. so, this in that sense it sequential but the length of the senquence is actually much less. I would like to see how would repeating the loop actually benefit training. 
* try adding multiple iterations of the encoder and then try with only one layer of encoder and see if there is a difference in the model performance. 

cool thing to know, you can use the underscore for any variable that is not gonna be used later. so for example, if you know a function will output 3 vars and you only need the first two, you can have the third variable saved as an underscore. or during a for loop, you can write for _ in range() this means that the place holder for the iterations will actually not be used inside the loop so you don't bother defining it. 

* Note that we must make sure in the attention paper bahdanua, we defined the correct variables to be saved and disregarded in the post-attention LSTM. 

So what does the decoder do? 
it seems that the decoder but for the decoder to start we need the encoder code in coursera to be complete we then can move to it? not right now I am primed to work to have at least an understanding of the decoder before going through it we do not necessarily start the code right away. 

so what does a decoder do? the decoder, has also an input that is prob encoded with embeddings and the pos encodings. then the decoder must go through yet another mha. to this mha that takes 3 inputs, we input the query as the input of the decoder and we input the output of the encoding as the key and value. why? query is where the model is at prediction. so essentially, the query has info about what has already been predicted. then you pass on all the info about the input as the key so the model learns what part of the input to focus on most when making prediction at the next step. you then multiply the attention scores with the value matrix which is again the input encoded. so essentially, the decoder takes the info on what has already been predicted and the full key matrix (input encoded) decides which parts of the input to pay attention to the most and once the attention scores are calculated, then the attention scores are weigh the encoded input. this is beautiful! then the mha might repeat for several iterations and then the output is added and normalized to the initial input of the decoder. 

* the input of the decoder will go through a masked multi-head attention. might repeat multiple times. then you add the initial input embeddings and encoding to the output of the multi-head loop (after you add the dropout layer to it). then this is inputed into another mha as the query. the key and the value are taken as the output of the encoder. another mha in a loop. then you add the dropout layer and then add to the query of this mha. then normalize and then run through a ffn. then again add dropout and add the input of ffn to the output.

there might be another linear map and the run through the softmax. and voila! 

ok so the first step is to modify our mha function. how? this model should take the query, key and values as inputs. previously, we would take the the input, and equal to the size of the input, we would calculate the query, key and value inside the mha. now take this calculation out. so the key, query and value will be defined outside the mha and inputed to reshape and cal attn scores. but note that this process must take place after the loop in the encoder is introduced. 

might also need to define a masked mha. 

in case it was needed, we can run our x matrix in the jupyter notebook of coursera and check if the outputs and inputs are the same and if one model performs differently than the other. 

In [None]:
#come back to the training which we have set to true for all code 

In [None]:
# ? would this be helpful for the task of sentiment analysis? I believe it should be. 

In [None]:
#change the padding of all 1s to a padding of all zeros and see how the performance of the model might change. 
# you might also be interested in applying a padding to the model to examine the improvment in the performance. 

In [None]:
#need to add training = training for all the dropouts applied so this will only occur during the training mode. not that right now, the model is 
#always in the training mode. no inference so the dropout layer is also applied during inference. 

There are multiple tasks that must be followed: 
1, build the decoder network from scratch. (today) 
2, build the transform's architecture (tom)
3, learn about the dropouts (tom)
4, learn about the masks (tom) 
5, apply the transformer to a task (2days each) 2 tasks (friday start this - sat done with one task) (sat - mon) finish the other task. 

In [66]:
class Decoder(tf.keras.layers.Layer): 

    def __init__(self, len_emb, dim_kv, dim_q, heads, 
                dd_model, iter, 
                drop_rate = 0.1, epsilon = 1e-6):  #dd_model is the number of neurons in the last layer of decoder (dense with softmax) 
        super(Decoder, self).__init__()
        self.len_emb = len_emb
        self.mha1 = MultiHeadAttention(dim_kv, dim_q, len_emb, heads) #remove the masking from the attributes and add it to the call argument) 
        self.mha2 = MultiHeadAttention(dim_kv, dim_q, len_emb, heads) #same for here 
        self.drop = Dropout(rate = drop_rate)
        self.layernorm = LayerNormalization(epsilon = epsilon)
        self.dense =  FullFeedForward(dd_model, len_emb) 
        self.iter = iter


#question! how does the built-in mha receive the number of q, k, v dims to map and create the q, k, v matrices? are the default. 
#question! during training will the layer normaliation parameters also train> if so, we need to define deperate layer norms to each. 
#question! there are some dense models in mha how are the number of neurons in them defined here? 


    def call(self, x, enc_output, training, look_ahead_mask, dec_pad_mask): 
        """

        """
    
        for _ in range(iter):
            
            # Add positional Encoding: #remove the pos embeddings and have it in hte transformer. 
            #x += pos_emb(x.shape[1], self.len_emb)
        
            # Add a dropout layer: 
            x = self.drop(x, training = training) 
            
            # Run through a MHA with the look-forward mask: 
            attn_mat1 = self.mha1(x, x, x, masking = look_ahead_mask)
            
            # Add dropout here during training:  
            attn_mat1 = self.drop(attn_mat, training = training)
            
            # Add and Normalize: 
            attn_mat1_x = self.layernorm(attn_mat1 + x)
            
            # Run through the next MHA: 
            attn_mat2 = self.mha2(x , enc_output, enc_output, masking = dec_pad_mask)
            
            # Add dropout during training: 
            attn_mat2 = self.drop(attn_mat2, training = training) 
            
            # Add and Normalize: 
            attn_mat2_x = self.layernorm(attn_mat2 +  attn_mat1_x) 
            
            # Run through a dense layer: 
            dense_output = self.dense(attn_mat2_x)
            
            # Add Dropout: 
            dense_drop = self.drop(dense_output, training = training)
            
            # Add and Normalize: 
            x = self.layernorm(dense_drop + attn_mat2_x) #x is the res but remember that since it's in a loop we still call it x. 
            
        return(x) 
            
        

In [68]:
len_emb = 50 
dim_kv = 30 
dim_q = 50 
heads = 3 
dd_model = 20 
iter = 3 
drop_rate = 0.1
function_decoder = Decoder(len_emb, dim_kv, dim_q, heads, 
                           dd_model, iter, drop_rate = 0.1, epsilon = 1e-6)
# Let's create an input to the decoder: note that the inputs now are difference and so are their embeddings. 
y  = tf.zeros((1,1,1))
#? won't know until I run an example for the decoder network. 
#function_decoder(y, output_encoder, training = True, look_ahead_mask = None, dec_pad_mask = None)

transformer: 
embeddings of the encoder and decoder should occur here but pos enc inside the encoder and decoder. 

In [None]:
class Transformer(tf.keras.layers.Layer): 

    def __init__(self, len_emb, dim_kv, dim_q, heads, d_model
                dd_model, iterEnc, iterDec, df_model, len_seq_out,
                drop_rate = 0.1, epsilon = 1e-6):
        
        super(Transformer, self).__init__()
        self.len_emb = len_emb
        self.len_seq_out = len_seq_out
        
        self.encoder = Encoder(dim_kv, dim_q, heads, d_model, len_emb, drop_rate = 0.1, iterEnc)
        
        self.decoder = Decoder(len_emb, dim_kv, dim_q, heads, dd_model, iterDec, drop_rate = 0.1, epsilon = 1e-6)
        
        self.dense =  Dense(units = df_model,activation = 'softmax') 
        
    def call(self, input_seqs, output_seqs, training, enc_pad_mask, dec_pad_mask, look_ahead_mask)
    
        """
        the output sequence and the input sequence must already be in the form of word embeddings added. we need two more paddings. <sos> and <eos> 
        len_seq in and out might be different 
        """
        
    #first pass the input embeddings to add the positional encodings no dropouts necessary as the encoder already has it: 
    len_seq = input_seqs.shape[1]
    input_seqs += pos_enc(len_seq_in, self.len_emb) 
    
    #multiply by a constant for numerical stability #look into it! 
    input_seqs *= tf.math.sqrt(tf.cast(self.len_emb,tf.float32))
    
    # Run through the encoder part: 
    enc_output = self.encoder(input_seqs, training = training, masking = enc_pad_mask)
    
    # Add positional encoding for the output sequence: 
    output_seqs += pos_enc(self.len_seq_out, self.len_emb)
    output_seqs *= tf.math.sqrt(tf.cast(self.len_emb,tf.float32))
    
    #Run through the decoder part: 
    dec_output = self.decoder(output_seqs, enc_output, training = training, look_ahead_mask = look_ahead_mask, dec_pad_mask = dec_pad_mask)
    
    # Run through a linear layer with activation function softmax 
    res = self.dense(dec_output) 
    return(re
    

        


before running through the final linear layer, do we add drop out to the model? 

My intuition is that when the output is not normalized, the algo will be caught in many local minima or maxima and cannot easily and quickly converge 

change the layer norms as they are also trainable. 