The objective of this project is to: 
* Create positional encodings to capture sequential relationships in data
* Calculate scaled dot-product self-attention with word embeddings
* Implement masked multi-head attention
* Build and train a Transformer model

In [130]:
# Loading the required packages: 
import tensorflow as tf
import time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tensorflow.keras.layers import Embedding, MultiHeadAttention, Dense, Input, Dropout, LayerNormalization
from tensorflow.keras.models import Sequential
from tensorflow import  reshape, shape, transpose

from transformers import DistilBertTokenizerFast #, TFDistilBertModel
from transformers import TFDistilBertForTokenClassification


from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.stem import PorterStemmer,LancasterStemmer
import re

from sklearn.model_selection import train_test_split 

In [2]:
# Calculate the angles for positional embeddings: 
def get_angles(pos, k, d):
    """
    Compute the raw angles (the arguments of sin/cos) of the sinusoidal
    positional encoding.

    Arguments:
        pos -- Column vector containing the positions [[0], [1], ...,[N-1]]
        k --   Row vector containing the dimension span [[0, 1, 2, ..., d-1]]
        d(integer) -- Encoding size

    Returns:
        angles -- pos / 10000**(2*(k//2)/d); with the column/row inputs above
                  this broadcasts to a (N, d) numpy array
    """
    # Dimensions 2i and 2i+1 share the same frequency index i.
    pair_index = k // 2

    # Wavelength term of the encoding for every dimension.
    exponent = 2 * pair_index / d
    denominator = 10000 ** exponent

    return pos / denominator

In [3]:
#Write down your own positional embedding: 
# What do we need? First calculate the angles, then pass them through sine and cosine and save the results into the positional embeddings.

In [4]:
# So how do we calculate the angles? For the angles we need the position pos, the dimension index i, and the embedding size d.

In [5]:
def pos_emb(len_seq, len_emb):
    """
    Create the sinusoidal positional encodings for every position in a sequence.

    Even columns hold sin(angle) and odd columns hold cos(angle), where
    angle = pos / 10000**(2*i/len_emb) and i = column // 2.

    Input:
    len_seq (int) : The length of the sequences inputed into the model.
    len_emb (int) : The length of the word embeddings for every word in the sequence.
                    Odd values are supported (the last column is a sine column).

    Note: the size of the positional encoding and the word embeddings must match
    in order to add them in the next step.

    Output:
    res (tf.Tensor, shape (1, len_seq, len_emb), float32) : res[0, i] is the
        positional encoding of the ith position in the sequence. The leading
        axis of size 1 lets the result broadcast over a batch.
    """
    # Column vector of positions and row vector of embedding dimensions.
    positions = np.arange(len_seq)[:, np.newaxis]
    dims = np.arange(len_emb)[np.newaxis, :]

    # Columns 2i and 2i+1 share the same angle.
    angles = positions / np.power(10000.0, 2 * (dims // 2) / len_emb)

    res = np.zeros((len_seq, len_emb))
    res[:, 0::2] = np.sin(angles[:, 0::2])
    # Index the odd columns directly: slicing the even columns here breaks for
    # odd len_emb, where there is one more even column than odd columns.
    res[:, 1::2] = np.cos(angles[:, 1::2])

    return tf.cast(res.reshape(1, len_seq, len_emb), dtype=tf.float32)

In [6]:
# Sanity check: positional encodings for a length-4 sequence with 8-dimensional embeddings.
pos_emb(4,8)

<tf.Tensor: shape=(1, 4, 8), dtype=float32, numpy=
array([[[ 0.0000000e+00,  1.0000000e+00,  0.0000000e+00,  1.0000000e+00,
          0.0000000e+00,  1.0000000e+00,  0.0000000e+00,  1.0000000e+00],
        [ 8.4147096e-01,  5.4030228e-01,  9.9833414e-02,  9.9500418e-01,
          9.9998331e-03,  9.9994999e-01,  9.9999981e-04,  9.9999952e-01],
        [ 9.0929741e-01, -4.1614684e-01,  1.9866933e-01,  9.8006660e-01,
          1.9998666e-02,  9.9980003e-01,  1.9999987e-03,  9.9999797e-01],
        [ 1.4112000e-01, -9.8999250e-01,  2.9552022e-01,  9.5533651e-01,
          2.9995501e-02,  9.9955004e-01,  2.9999956e-03,  9.9999553e-01]]],
      dtype=float32)>

We want the softmax that produces the attention weights to assign zero weight to the padded positions of a sequence. One option is to define a function that replaces all-zero vectors with a very large negative number (e.g. -1e9, standing in for negative infinity); another is to assign that value to every padded token when creating the padded embeddings for each input. However, if the padding value is inserted before the dot-product attention (i.e. before the softmax), multiplication with the matrices q, k and v can make the padded vectors grow, and the softmax may then no longer assign zero attention to the padded positions. Therefore, the padding mask must be added after the dot product: compute the dot product, add the mask, apply the softmax, and then multiply with the V matrix. Where do we normalize (scale)? We scale the attention scores after the dot product and before the mask is applied.

In [7]:
# Encoding the input with Glove Word Embeddings: 
def gvec_input(x,m,e):
    """
    Encode tokenized comments as fixed-length sequences of GloVe vectors.

    Every comment is truncated or right-padded with '<pad>' to exactly m tokens,
    and every token is replaced by a vector of length e.

    Relies on the notebook-level names `embeddings_dict` (token -> vector) and
    `vectors` (list of all GloVe vectors), which are created by the cell that
    loads the GloVe file.

    inputs:

    x (sequence of list of str) : tokenized comments, one list of tokens per comment.
    m (int)    : size of the sequence
    e (int)    : size of the embeddings (must match the length of the GloVe vectors)

    outputs:
    gv (np.ndarray, shape (len(x), m, e)) : gv[i, l] is the encoding of token l
        of comment i:
          * '<pad>' tokens         -> a vector of ones
          * tokens not in GloVe    -> the mean of all GloVe vectors
          * every other token      -> its GloVe vector
    """
    n = len(x)
    gv = np.zeros((n, m, e))

    pad_vector = np.ones(e)
    # The mean over the whole vocabulary is expensive, so it is computed at
    # most once, and only if an out-of-vocabulary token is actually met.
    oov_vector = None

    for i in range(n): #looping over each comment
        txt = x[i] #select the ith comment
        txt = txt[:m] if len(txt) > m else txt + ['<pad>'] * (m - len(txt)) #shorten or add extra padding

        for l, token in enumerate(txt): #looping over each word
            if token == "<pad>":
                gv[i, l, :] = pad_vector
            # Dictionary lookup is O(1); `words` holds the same keys as a list,
            # which would make this test O(vocabulary size) per token.
            elif token not in embeddings_dict:
                if oov_vector is None:
                    oov_vector = np.mean(vectors, axis=0)
                gv[i, l, :] = oov_vector
            else:
                gv[i, l, :] = embeddings_dict[token]
    return gv

In [8]:
# Load the customer reviews from the Excel workbook into a DataFrame.
CustomerFeed = 'Canva_reviews.xlsx'
df = pd.read_excel(CustomerFeed)

# NOTE(review): this prints the whole frame, including the userName column;
# consider displaying only df.head() and clearing the output before sharing.
print(df)

                                               reviewId            userName  \
0     gp:AOqpTOFxf3fttcT5DSvFIn9KPp5FErgH9yC533Fmoxv...      Donna Caritero   
1     gp:AOqpTOEq6rNIWLnPV4KFTctWvm0mpGEQljtD6mvy1H-...  Soumi Mukhopadhyay   
2     gp:AOqpTOE86hSyPRHZgYt28Uk5zGe4FZGb1hkmtFDiYJ2...   Theknown _unknown   
3     gp:AOqpTOHSuKkVTcM3QgCCKysHQlxEnk2ocOKsUMiMIJy...        Anthony Dean   
4     gp:AOqpTOEOrZt5H6jXPiplJyffCd5ZBnVXACTWgwNsF1R...   Neha Diana Wesley   
...                                                 ...                 ...   
1495  gp:AOqpTOHhnXMpylU3f-1V1KbR2hwWArOilxPlKI6K4xY...            Reen Ali   
1496  gp:AOqpTOEcz62DHS-amqTB5xGMhM4_R0UJpcv_HDNny9i...     Shaurya Chilwal   
1497  gp:AOqpTOFMqEqa_kpp29Q8wjcBmKUCAvOQGQx4KZQ8b83...           GK Gaming   
1498  gp:AOqpTOGY4z3pUxeiqGzn2ad3Noxqlbm-9DZ3ksHqD1_...    1203_Vani Sharma   
1499  gp:AOqpTOFVGZ0MXyR-Gv_d2cYf2KD709Hwple_u7OZE4y...           MeLLy EcK   

                                              userI

In [9]:
# Keep only the review text and its sentiment label.
# NOTE(review): this rebinds `df`, so the full frame loaded above is no longer
# reachable under that name after this cell runs.
df = df[["review", "Sentiment"]]
df.head()

Unnamed: 0,review,Sentiment
0,Overall it's really an amazing app. I've been ...,Negative
1,Hey! Yes I gave a 5 star rating... coz I belie...,Positive
2,Canva used to be a good app! But recently I've...,Negative
3,"It's a brilliant app, but I have just one prob...",Negative
4,This was such a great app. I used to make BTS ...,Negative


In [10]:
def edit_txt(review, keep_negations=False):
    """
    Clean one review and return it as a list of tokens.

    Steps, in order:
    1, all words converted to lower case
    2, digits removed
    3, the text is tokenized with nltk's word_tokenize
    4, punctuation removed from every token (tokens that are only punctuation are dropped)
    5, English stop words are removed.

    Arguments:
        review (str) -- the raw review text.
        keep_negations (bool) -- when True, negation words such as "not", "don",
            "isn" are NOT treated as stop words and therefore stay in the output.
            The default (False) removes every nltk English stop word.

    Returns:
        list of str -- the cleaned tokens.
    """
    # Negation-bearing stop words; only consulted when keep_negations is True.
    negations = {"not",'don',"don't",'should',"should've", 'ain','aren',"aren't",'couldn',"couldn't",'didn',"didn't",'doesn',"doesn't",'hadn',"hadn't",'hasn',"hasn't",'haven',"haven't",'isn',"isn't",'mightn',"mightn't",'mustn',"mustn't",'needn',"needn't",'shan',"shan't",'shouldn',"shouldn't",'wasn',"wasn't",'weren',"weren't",'won',"won't",'wouldn', "wouldn't"}

    # Lower-case, then replace every digit character with an empty string.
    text = re.sub(r'[0-9]', '', review.lower())

    # Tokenize the comment.
    tokens = word_tokenize(text)

    # Remove punctuation: keep only the \w+ pieces of each token, glued back
    # together; a token with no such piece disappears.
    word_chars = RegexpTokenizer(r'\w+')
    cleaned = []
    for token in tokens:
        pieces = word_chars.tokenize(token)
        if pieces:
            cleaned.append(''.join(pieces))

    # Remove common words; a set gives constant-time membership tests.
    stop_words = set(stopwords.words('english'))
    if keep_negations:
        stop_words -= negations

    return [word for word in cleaned if word not in stop_words]


In [11]:
# Extract the raw review texts.
x = df["review"] 

# Clean every review with edit_txt and compare one cleaned review with its raw text.
reviews_edited = [edit_txt(review) for review in x]
print(reviews_edited[13])
print(x[13])

# Define the target labels and extract the unique label values.
y = df["Sentiment"].tolist()
ranking = np.unique(y)
ranking = ranking.tolist()
ranking

['unable', 'save', 'work', 'nothing', 'works']
Unable to save my work. Nothing works :(


['Negative', 'Positive']

### <font color = "red"> Do we need the following? </font>

In [12]:
# Build the vocabulary: the sorted unique tokens over all cleaned reviews.
# NOTE(review): `Split` and `Dic` are not used by any other visible cell — confirm and remove.
Split = [] 
Dic = []
dictionary = np.unique([word for review in reviews_edited for word in review]).tolist()

# Append the padding token used to bring every input to the same length.
dictionary = dictionary + ["<pad>"]
# Peek at a few vocabulary entries.
dictionary[1:10]

['aa',
 'aap',
 'ability',
 'able',
 'absolutely',
 'acc',
 'accepted',
 'access',
 'accessibilities']

In [13]:
# Split the raw reviews and labels into training (75%) and testing (25%) sets.
# The fixed random_state makes the shuffle reproducible.
#x = x.to_list()
X_train, X_test, y_train, y_test = train_test_split(x,y, 
                                   random_state=104,  
                                   test_size=0.25,  
                                   shuffle=True) 

In [14]:
# Apply the edit_txt function to both text corpora, turning every review into a list of tokens.
# NOTE(review): this cell overwrites its own inputs, so it is not idempotent —
# running it a second time passes token lists (not strings) to edit_txt.
X_train = [edit_txt(comment) for comment in X_train]
X_test = [edit_txt(comment) for comment in X_test]

In [15]:
# Load the word embeddings (GloVe word embeddings).
# Each line of the file is "<token> <v1> <v2> ... <vN>", separated by whitespace.
embeddings_dict = {}
# Open the file explicitly as UTF-8: without `encoding`, Python uses the
# platform default codec, which can raise UnicodeDecodeError (or mis-decode
# tokens) on systems whose default is not UTF-8.
with open("glove.6B.50d.txt", 'r', encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

# Parallel lists of the vocabulary and its vectors (same order as the dictionary).
words =  list(embeddings_dict.keys())
vectors = [embeddings_dict[word] for word in words]

In [30]:
# Limit the length of the sequence: 
m = 30 
# The length of the embeddings; gvec_input copies GloVe vectors into rows of
# this length, so it has to match the vectors loaded from glove.6B.50d.txt.
e = 50
# Encode both corpora as arrays of shape (number of reviews, m, e).
X_trainmod = gvec_input(X_train,m,e) 
X_testmod = gvec_input(X_test,m,e)

In [31]:
# Inspect the first encoded review; rows made only of ones are the '<pad>'
# positions written by gvec_input.
print(X_trainmod[0])
X_trainmod.shape

[[ 0.79238999  0.21864     0.68711001 ... -0.066753   -0.39660001
   0.74818999]
 [ 0.36998999  0.082841    0.16883001 ...  0.0053184  -0.50853002
   0.24986   ]
 [ 0.02648     0.33737001  0.065667   ... -0.3398     -0.23043001
   0.19069   ]
 ...
 [ 1.          1.          1.         ...  1.          1.
   1.        ]
 [ 1.          1.          1.         ...  1.          1.
   1.        ]
 [ 1.          1.          1.         ...  1.          1.
   1.        ]]


(1125, 30, 50)

In [18]:
def padding_mask(tensor):
    """
    Replace every all-zero vector along the last axis with a vector filled with
    -1e9, so that a subsequent softmax gives those positions a weight of
    (numerically) zero.

    Arguments:
        tensor -- tensor or array whose last axis holds the per-position vectors,
                  e.g. (len_seq, len_emb) or (batch, len_seq, len_emb).

    Returns:
        result_tensor -- tensor of the same shape and dtype as the input, with
                         all-zero vectors replaced by -1e9.

    NOTE(review): gvec_input encodes '<pad>' tokens as vectors of ones, not
    zeros, so this function does not detect that padding — confirm which
    convention is intended.
    """
    tensor = tf.convert_to_tensor(tensor)

    # True where a whole vector (last axis) is zero; keepdims lets the flag
    # broadcast back over that axis for inputs of any rank.
    is_zero_row = tf.reduce_all(tf.equal(tensor, 0), axis=-1, keepdims=True)

    # A large NEGATIVE number (-1e9, not -1e-9) in the input's own dtype.
    fill_value = tf.cast(-1e9, tensor.dtype)

    result_tensor = tf.where(is_zero_row, fill_value, tensor)
    return(result_tensor)


In [19]:
#now time to define the self_attention: 

In [149]:
def self_attention(q,k,v, masking):
    """
    Scaled dot-product attention: softmax(q @ k^T / sqrt(d) + mask) @ v,
    where d is the size of the last axis of k (the axis contracted by q @ k^T).

    Arguments:
        q, k, v -- tensors (or arrays) whose last two axes are multiplied as
                   matrices; q and k must share the size of their last axis, and
                   the second-to-last axis of v must match that of k.
        masking -- None, or a 0/1 array broadcastable to the shape of q @ k^T.
                   Entries equal to 1 are kept; entries equal to 0 receive a
                   large negative score and therefore ~0 attention weight.

    Returns:
        res -- the attention-weighted values, i.e. softmax weights @ v (not the
               raw attention scores).
    """
    # Perform matrix multiplication on the last two dimensions
    scores = tf.matmul(q, k, transpose_b = True)

    # Scale in the dtype of the scores: TensorFlow does not promote mixed
    # float32/float64 operands, so a hard-coded float32 factor fails on float64
    # inputs. tf.shape also works when the static shape is unknown.
    depth = tf.cast(tf.shape(k)[-1], scores.dtype)
    scores = scores / tf.math.sqrt(depth)

    # Add the mask, if one is given, after scaling and before the softmax.
    if masking is not None:
        scores += (1.0 - tf.cast(masking, scores.dtype)) * -1e9

    attention_weights = tf.nn.softmax(scores, axis = -1)
    res = tf.matmul(attention_weights, v)

    return(res)
    

In [142]:
# NOTE(review): dense_q, dense_k and dense_v are not assigned at notebook level
# in any visible cell (they only appear as locals inside MultiHeadAttention.call),
# so this cell depends on hidden kernel state and will fail on a fresh kernel.
print(dense_q.shape)
print(dense_k.shape)
print(dense_v.shape)

(1125, 30, 20)
(1125, 30, 30)
(1125, 30, 30)


In [176]:
# Split the q, k, v projections into 2 heads.
# NOTE(review): reshape_tensor is defined in a cell further down the notebook,
# so this cell only works after that later cell has been executed.
reshaped_q = reshape_tensor(dense_q, 2, pre_attention = True)
reshaped_k = reshape_tensor(dense_k, 2, pre_attention = True) 
reshaped_v =  reshape_tensor(dense_v, 2 ,pre_attention  = True) 

print(reshaped_q.shape)
print(reshaped_k.shape)
print(reshaped_v.shape)

(1125, 2, 10, 30)
(1125, 2, 15, 30)
(1125, 2, 15, 30)


In [136]:
# Experiment with axis permutations of the per-head tensors.
# NOTE(review): the recorded output for q_transposed is (1125, 30, 10, 2); the
# shape (#sample, #head, 30, 10) originally noted on the next line presumably
# described the input at the time this cell was run — confirm.
# q_transposed and k_transposed are not used by any other visible cell.
q_transposed = tf.transpose(reshaped_q, perm=[0, 2, 3, 1])
k_transposed = tf.transpose(reshaped_k, perm=[0, 2, 1, 3])
print(q_transposed.shape)
print(k_transposed.shape)

(1125, 30, 10, 2)
(1125, 30, 2, 15)


In [177]:
# Run scaled dot-product attention per head, without a mask.
# Despite its name, `attention_scores` holds the value returned by
# self_attention, i.e. the softmax weights already multiplied by v.
attention_scores = self_attention(reshaped_q, reshaped_k, reshaped_v, None)
attention_scores.shape

TensorShape([1125, 2, 10, 30])

In [None]:
# Merge the heads back into a 3-D tensor and check the resulting shape.
# NOTE(review): `heads` is first assigned in a cell further down (In [212]),
# so this cell relies on out-of-order execution.
reshape_tensor(attention_scores, heads, pre_attention = False).shape

In [23]:
# Toy mask for self_attention: 1 keeps a position, 0 masks it out (the third
# key position is hidden from every query here).
mask = np.array([[[1, 1, 0, 1], [1, 1, 0, 1], [1, 1, 0, 1]]])

In [24]:
# Attention without a mask.
# NOTE(review): q, k and v are not assigned in any visible cell — this cell
# depends on hidden kernel state and will fail on a fresh kernel.
self_attention(q,k,v,None)

<tf.Tensor: shape=(3, 4), dtype=float32, numpy=
array([[0.2589478 , 0.42693272, 0.15705977, 0.15705977],
       [0.2772748 , 0.2772748 , 0.2772748 , 0.16817567],
       [0.33620113, 0.33620113, 0.12368149, 0.2039163 ]], dtype=float32)>

In [25]:
# Attention with the toy mask applied.
# NOTE(review): q, k and v are not assigned in any visible cell — this cell
# depends on hidden kernel state and will fail on a fresh kernel.
self_attention(q,k,v,mask)

<tf.Tensor: shape=(1, 3, 4), dtype=float32, numpy=
array([[[0.3071959 , 0.5064804 , 0.        , 0.18632373],
        [0.38365173, 0.38365173, 0.        , 0.23269655],
        [0.38365173, 0.38365173, 0.        , 0.23269655]]], dtype=float32)>

### <font color="red"> Review masked functions</font>

Preferably, we want the input of the Encoder structure to already have the word embeddings and the positional encodings. In the Encoder structure, we will have the multi-head attention (think of it as running the self-attention multiple times) and a fully connected neural network which will be called FullFeedForward. 

In [26]:
def FullFeedForward(n_1, emb_size):
    """
    Build the position-wise feed-forward block of the encoder.

    Arguments:
        n_1 (int)      -- number of units in the hidden Dense layer.
        emb_size (int) -- number of units in the output Dense layer; it equals
                          the embedding size so the output can be combined with
                          tensors of the same size as the input embeddings.

    Returns:
        A Keras Sequential model made of two tanh-activated Dense layers named
        "dense1" and "dense2".
    """
    # Hidden layer: (#samples, len_seq, n_1)  -- relu?
    hidden_layer = Dense(n_1, activation='tanh', name="dense1")
    # Output layer: (#samples, len_seq, emb_size)  -- linear?
    output_layer = Dense(emb_size, activation='tanh', name="dense2")

    return Sequential([hidden_layer, output_layer])
    

# Questions
Why is the embedding size also taken as an argument in MHA? We get the matrices q, k, and v. The product of qTk will give a matrix of dim_k or dim_q by emb_size. The final product in the attention mechanism must yield a matrix with the same sequence length and emb_size as the input.

* look into the command of MHA.
* LayerNormalization.

### Multi-head attention? 
We will input 3 xs (possibly they could be different?) then the inputs are mapped linearly to give us the matrices Query, Key and Value. 
* dimension x (#batches, len_seq, len_emb)
* dim of k:$K^T x$ if k is (len_seq,dim_k), then its transpose is (dim_k, len_seq), the resultant matrix is going to have dim (dim_k, len_emb)
* dim of q: $Q^T x $; if q is (len_seq,dim_q), then its transpose is of dim (dim_q, len_emb) and the resultant dot product gives (dim_q,len_emb)
* Similarly, for the multiplication of $V^T x$, we have the value being of dimension (dim_v, len_emb).
  * if it is a self-attention (attention with only one head), then $qk^T$ has dim (dim_q, dim_k), scale, add the mask and dropout if given.
  * if it has n heads, then we will produce query and key matrices of dimensions dim_q/n, dim_k/n. After the dot product, the result is of dim (dim_q/n, dim_k/n). We then concatenate these results to get the desired dim of (dim_q,dim_k). **Make sure you understand the concatenation.**
* dot product of v (dim_v, len_emb) with qTk (dim_q, dim_k) --> $ qTk .v $ Note that here dim_k must be the same as the dimension of v (dim_v) for this dot product to be possible.
* just like magic, you have the attention scores now and the result is a matrix of (dim_k, len_emb).
* so then we add our initial x and normalize too. in order to add x to the attention scores, the attention scores need to have the same dim as x. meaning that dim_k needs to be the same as the len of the sequence.

### Fully Connected Neural Network: 

We feed the matrix coming out of the attention mechanism into the fully connected neural network. How many neurons? What matters is that the output layer must have len_emb neurons in order to match the dimension of x. Why do they need to match? Because we again add the input sequence x to the result (after another layer of normalization).

Then copy the result, pass as key and value to the decoder network. 

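# Question: isn't the dot product we are talking about here actually a cross product?!

It is neither a single dot product nor a vector cross product: it is a matrix product, in which every entry is the dot product of one row of the first matrix with one column of the second.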

In [173]:
def reshape_tensor(q_matrix, heads, pre_attention):
    """
    Split a projection into heads before attention, or merge the heads back
    after attention.

    Arguments:
        q_matrix      -- tensor to reshape.
        heads (int)   -- number of attention heads (only used when
                         pre_attention is True).
        pre_attention -- True:  a 3-D input of shape (a, b, c) is reshaped to
                                (a, b, heads, c/heads) and then permuted to
                                (a, heads, c/heads, b).
                         False: a 4-D input of shape (a, h, d, b) is permuted to
                                (a, b, h, d) and its last two axes are flattened,
                                giving (a, b, h*d).

    Returns:
        The reshaped tensor.
    """
    # Post-attention: revert back to 3-D.
    if not pre_attention:
        heads_last = transpose(q_matrix, [0, 3, 1, 2])
        merged_dims = shape(heads_last)
        return reshape(heads_last, (merged_dims[0], merged_dims[1], -1))

    # Pre-attention: reform into 4-D with one slice per head.
    input_dims = shape(q_matrix)
    per_head = reshape(q_matrix, (input_dims[0], input_dims[1], heads, -1))
    return transpose(per_head, [0, 2, 3, 1])
        

In [210]:
from tensorflow.keras.layers import Layer

class MultiHeadAttention(Layer):
    """
    Multi-head attention layer with a residual connection.

    The input x is projected to queries, keys and values with three Dense
    layers, split into `heads` heads with reshape_tensor, passed through
    self_attention, merged back, projected to `len_emb` units and added to x.

    NOTE: this class has the same name as tensorflow.keras.layers.MultiHeadAttention,
    which the notebook imports in its first cell; once this cell has run, the
    name `MultiHeadAttention` refers to this class.

    Arguments:
        dim_kv (int)  -- number of units of the key and of the value projection.
        dim_q (int)   -- number of units of the query projection.
        len_emb (int) -- number of units of the output projection; must equal
                         the last dimension of the input because the output is
                         added to the input.
        heads (int)   -- number of attention heads; must divide dim_kv and dim_q.
        masking       -- mask forwarded to self_attention, or None.
        **kwargs      -- forwarded to keras.layers.Layer.

    Raises:
        ValueError -- if dim_q or dim_kv is not divisible by heads.
    """
    def __init__(self, dim_kv, dim_q, len_emb, heads, masking, **kwargs):

        super().__init__(**kwargs)

        # The per-head split reshapes the projections to (..., heads, dim/heads).
        if dim_q % heads != 0 or dim_kv % heads != 0:
            raise ValueError(
                "dim_q and dim_kv must both be divisible by heads; got "
                f"dim_q={dim_q}, dim_kv={dim_kv}, heads={heads}"
            )

        self.dim_k = self.dim_v = dim_kv
        self.dim_q = dim_q
        self.heads = heads
        self.masking = masking
        self.d_model = len_emb

        # Sub-layers are created once here so that their weights are tracked by
        # Keras and reused on every call. Creating them inside call() would
        # re-initialize the weights on each invocation and nothing would train.
        self.dense_q = Dense(units = self.dim_q)
        self.dense_k = Dense(units = self.dim_k)
        self.dense_v = Dense(units = self.dim_v)
        self.dense_out = Dense(units = self.d_model)


    def call(self,x,**kwargs):
        """
        Apply multi-head attention to x and add the result to x.

        x is expected to be (#samples, len_seq, len_emb); the result has the
        same shape.
        """
        dense_q = self.dense_q(x) # shape = (#samples, len_seq, dim_q)
        dense_k = self.dense_k(x) # shape = (#samples, len_seq, dim_k)
        dense_v = self.dense_v(x) # shape = (#samples, len_seq, dim_v)

        # Reshape, using the head count stored on the layer (not a global):
        dense_qre = reshape_tensor(dense_q, self.heads, pre_attention = True) #shape = (#samples, #heads, dim_q/heads, len_seq)
        dense_kre = reshape_tensor(dense_k, self.heads, pre_attention = True) #shape = (#samples, #heads, dim_k/heads, len_seq)
        dense_vre = reshape_tensor(dense_v, self.heads, pre_attention = True) #shape = (#samples, #heads, dim_v/heads, len_seq)

        # Attention-weighted values, using the mask stored on the layer:
        attention_scores = self_attention(dense_qre, dense_kre, dense_vre, self.masking) #shape = (#samples, #heads, dim_q/heads, len_seq)

        # Revert the shape:
        attention_with_v = reshape_tensor(attention_scores, self.heads, pre_attention = False) #shape = (#samples, len_seq, dim_q)

        # Run through another dense and add to the initial x (residual connection):
        res = self.dense_out(attention_with_v) + x  # shape = (#samples, len_seq, d_model)
        # TODO: dropout and layer normalization are not applied inside this layer.
        return(res)


In [212]:
# Hyper-parameters for a quick shape check of the custom MultiHeadAttention layer.
dim_kv = 30 
dim_q = 20 
# The layer adds its output to its input, so len_emb has to equal the last
# dimension of X_trainmod.
len_emb = 50
heads = 2 

masking = None 

# Build the layer without a mask and confirm the output keeps the input shape.
function = MultiHeadAttention(dim_kv, dim_q, len_emb, heads, None)
function(X_trainmod).shape 

TensorShape([1125, 30, 50])

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
    """
    One Transformer encoder block: multi-head attention followed by a
    position-wise feed-forward network, each followed by layer normalization.

    Subclassing tf.keras.layers.Layer makes this class usable like any other
    Keras layer (Dense, Dropout, ...). The input x is expected to already
    contain the word embeddings plus the positional encodings.

    Arguments:
        num_head (int)   -- number of attention heads; must divide emb_size.
        emb_size (int)   -- embedding size of the input; the attention block and
                            the feed-forward block both output this size so that
                            their results can be added to their inputs.
        n_1 (int)        -- number of neurons in the hidden layer of the
                            feed-forward network.
        var_norm (float) -- epsilon of the layer normalizations: a small constant
                            added to the variance to avoid division by zero.
        var_drop (float) -- dropout rate applied to the feed-forward output.
        **kwargs         -- forwarded to tf.keras.layers.Layer.
    """
    def __init__(self, num_head, emb_size, n_1, var_norm=1e-6, var_drop=0.1, **kwargs):
        super().__init__(**kwargs)

        # Multi-head attention as defined in this notebook: it already adds the
        # residual connection (its output is attention(x) + x). Queries, keys
        # and values all use emb_size units here.
        self.mha = MultiHeadAttention(dim_kv=emb_size, dim_q=emb_size,
                                      len_emb=emb_size, heads=num_head,
                                      masking=None)

        # Fully connected network; its last layer has emb_size units so the
        # output has the same dimensions as the input.
        self.ffn = FullFeedForward(n_1, emb_size)

        # Two separate normalization layers: each one learns its own scale and
        # offset, so a single shared instance would tie those parameters.
        # epsilon must be passed by keyword: the first positional argument of
        # LayerNormalization is `axis`.
        self.NormalizedLayer1 = LayerNormalization(epsilon=var_norm)
        self.NormalizedLayer2 = LayerNormalization(epsilon=var_norm)

        # Dropout randomly zeroes activations during training only; it is
        # applied to the feed-forward output.
        self.dropout = Dropout(var_drop)

    def call(self, x, training=None):
        """
        Run the encoder block on x of shape (#samples, len_seq, emb_size) and
        return a tensor of the same shape.
        """
        # Attention (residual already added inside self.mha), then normalize.
        attention_out = self.NormalizedLayer1(self.mha(x))

        # Feed-forward, dropout, residual connection, then normalize again.
        ffn_out = self.dropout(self.ffn(attention_out), training=training)
        return self.NormalizedLayer2(attention_out + ffn_out)
    

My intuition is that when the output is not normalized, the optimization algorithm gets caught in many local minima or maxima and cannot converge easily and quickly.

In [None]:
class EncoderLayer(tf.keras.layers.Layer): #here the brackets mean that this class will be part of the layers in keras and can be used like 
                                        #any other layer like Dense or multi-head. 

    #specifies some attributes in brackets: 
    def __init__(self, embedding_dim, num_heads, fully_connected_dim,
                 dropout_rate=0.1, layernorm_eps=1e-6): 
        super(EncoderLayer, self).__init__() #this specifies that EncoderLayer will have all the attributes of layer.
    #now define the multi-head: 
    self.mha = MultiHeadAttention(num_heads=num_heads,
                                      key_dim=embedding_dim,
                                      dropout=dropout_rate)

    self.ffn = FullyConnected(embedding_dim=embedding_dim,
                                fully_connected_dim=fully_connected_dim)

    self.layernorm1 = LayerNormalization(epsilon=layernorm_eps)
    self.layernorm2 = LayerNormalization(epsilon=layernorm_eps)

    self.dropout_ffn = Dropout(dropout_rate)