In [1]:
import tensorflow as tf
import keras
import numpy as np

In [2]:
# Self-attention mechanism steps : 1. Calculate the attention scores as the dot product of the target word vector with each word vector present in the sentence
# 2. Calculate the sum of all the word vectors present in the sentence, weighted by their relevance scores (attention scores). This creates our new target word vector, which also contains
# information about the surroundings of the target word

In [3]:
from keras.activations import softmax

In [7]:
# Self attention mechanism
# Steps performed : Step 1: Initializing the output array
# Step 2 : Iterating through the input sequence so each word can once be the target word
# Step 3 : "scores" is the array that will contain the attention scores for target word with each word vector present in the sequence - Initialized 
# Step 4 : If input sequence is [[1,2,3],[4,5,6],[7,8,9]] , then
#For the first token [1, 2, 3]: -- Target token
#Dot product with itself: 1*1 + 2*2 + 3*3 = 1 + 4 + 9 = 14
#Dot product with the second token [4, 5, 6]: 1*4 + 2*5 + 3*6 = 4 + 10 + 18 = 32
#Dot product with the third token [7, 8, 9]: 1*7 + 2*8 + 3*9 = 7 + 16 + 27 = 50
#So, the attention scores for the first token [1, 2, 3] are [14, 32, 50].
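# Step 5 : Once we get the attention scores, we scale them by dividing by np.sqrt(embedding dimension) and normalize them using softmax activation so that they sum to 1
# Step 6 : Each word vector is multiplied by its normalized attention score, and the sum of these weighted vectors is the new vector representation
# (For readability, the worked example below normalizes by the plain sum of the scores instead of scaling + softmax)
# Target token --> [1,2,3]
# Attention scores --> [14,32,50] --> sum = 14 + 32 + 50 = 96   
# Therefore, weights for the 1st token --> [14/96, 32/96, 50/96]
# Weighted vector_1 of 1st token --> 14/96 * [1,2,3] = [14/96, 28/96, 42/96]
# Similarly, weighted vector_2 of 1st token --> 32/96 * [4,5,6] = [128/96, 160/96, 192/96]
# Similarly, weighted vector_3 of 1st token --> 50/96 * [7,8,9] = [350/96, 400/96, 450/96]

# Therefore, the new pivot representation will be [(14 + 128 + 350)/96, (28 + 160 + 400)/96, (42 + 192 + 450)/96] = [5.125, 6.125, 7.125]  -> For 1st token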

# Similarly, the new pivot representation will be calculated for the next word in the input sequence as the target token
# Input sequence is a sequence of vector representations of the words (one row per word)
def self_attention(input_sequence):
    """Apply scaled dot-product self-attention to a sequence of token vectors.

    Every token takes a turn as the "pivot" (target) token. Its new
    representation is the weighted sum of all token vectors in the sequence,
    where the weights are the softmax of the scaled dot products between the
    pivot and each token.

    Parameters
    ----------
    input_sequence : array-like of shape (num_tokens, embedding_dim)
        One row per token; anything ``np.asarray`` can turn into a 2-D
        float array is accepted.

    Returns
    -------
    numpy.ndarray of shape (num_tokens, embedding_dim)
        The context-aware representation of every token.

    Raises
    ------
    ValueError
        If ``input_sequence`` is not two-dimensional.
    """
    input_sequence = np.asarray(input_sequence, dtype=float)
    if input_sequence.ndim != 2:
        raise ValueError(
            "input_sequence must be 2-D (num_tokens, embedding_dim), "
            f"got shape {input_sequence.shape}"
        )

    num_tokens, embedding_dim = input_sequence.shape
    # The output has the same shape as the input: one new vector per token.
    output = np.zeros(shape=input_sequence.shape)
    if num_tokens == 0 or embedding_dim == 0:
        # Nothing to attend over (and avoids dividing by sqrt(0)).
        return output

    for i, pivot_vector in enumerate(input_sequence):  # each token is the target once
        # Attention scores: dot product of the pivot with every token vector.
        scores = input_sequence @ pivot_vector
        # Scale by sqrt(embedding_dim) so scores do not grow with vector size.
        scores /= np.sqrt(embedding_dim)
        # Softmax, computed in NumPy. Subtracting the max leaves the result
        # unchanged but keeps np.exp from overflowing on large scores.
        scores -= scores.max()
        weights = np.exp(scores)
        weights /= weights.sum()
        # New pivot representation: attention-weighted sum of all token vectors.
        output[i] = weights @ input_sequence
    return output

### Calculation behind new pivot representation
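For the first token [1, 2, 3] of the four-token sequence [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]] (for readability the weights below are normalized by the plain sum of the attention scores, rather than by the scaled softmax used in the code):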

Weight for the first vector [1, 2, 3]:

    weight_1 = 14 / (14 + 32 + 50 + 68) = 14 / 164 ≈ 0.0854

Weight for the second vector [4, 5, 6]:

    weight_2 = 32 / (14 + 32 + 50 + 68) = 32 / 164 ≈ 0.1951

Weight for the third vector [7, 8, 9]:

    weight_3 = 50 / (14 + 32 + 50 + 68) = 50 / 164 ≈ 0.3049

Weight for the fourth vector [10, 11, 12]:

    weight_4 = 68 / (14 + 32 + 50 + 68) = 68 / 164 ≈ 0.4146

Weighted vector for the first vector [1, 2, 3]:

    weighted_vector_1 = 0.0854 * [1, 2, 3] = [0.0854, 0.1707, 0.2561]

Weighted vector for the second vector [4, 5, 6]:

    weighted_vector_2 = 0.1951 * [4, 5, 6] = [0.7805, 0.9756, 1.1707]

Weighted vector for the third vector [7, 8, 9]:

    weighted_vector_3 = 0.3049 * [7, 8, 9] = [2.1341, 2.4390, 2.7439]

Weighted vector for the fourth vector [10, 11, 12]:

    weighted_vector_4 = 0.4146 * [10, 11, 12] = [4.1463, 4.5610, 4.9756]


Sum of weighted vectors:

    [0.0854 + 0.7805 + 2.1341 + 4.1463, 
     0.1707 + 0.9756 + 2.4390 + 4.5610,
     0.2561 + 1.1707 + 2.7439 + 4.9756]

New pivot representation:

    [7.1463, 8.1463, 9.1463]

weight_2 = 32 / (14 + 32 + 50 + 68) = 32 / 164 ≈ 0.1951 

Vector_2 = [4 , 5 , 6]

Weighted vector for the second vector:
[4 * 0.1951, 5 * 0.1951, 6 * 0.1951]