## Scaled dot-product attention



### Bare-bones attention implementation

In [254]:
import numpy as np


SEQ_LEN = 5    # number of rows in the input matrix
EMBD_LEN = 10  # number of features per row
SEED = 0       # fixed so that Restart & Run All reproduces the same numbers

# Seed NumPy's legacy global generator; every np.random.normal call in this
# notebook draws from it, so the weights created later become repeatable too.
np.random.seed(SEED)

# input matrix
x = np.random.normal(size=(SEQ_LEN, EMBD_LEN))

### Attention scores and feature enrichment (vectorized)

In [256]:
# Entry (i, j) is the dot product of row i and row j of x.
pairwise_scoring = x @ x.T
# Row i of the result is the sum of all rows of x, weighted by pairwise_scoring[i].
enriched_x = pairwise_scoring @ x


### Feature enrichment (non-vectorized)

In [258]:
# Loop-based counterpart of the vectorized product: row i of enriched_x_ is the
# sum of all word vectors in the sentence, weighted by the relevancy scores in
# pairwise_scoring[i].
enriched_x_ = np.zeros(shape=x.shape)

for i, scores in enumerate(pairwise_scoring):
    weighted_sum = np.zeros(x[0, :].shape)
    for j, score in enumerate(scores):
        weighted_sum += x[j, :] * score
    # Write into the loop-based result. Assigning to enriched_x here would leave
    # enriched_x_ all zeros and overwrite the vectorized output.
    enriched_x_[i] = weighted_sum

# Both formulations must agree up to floating-point rounding.
assert np.allclose(enriched_x_, enriched_x)
    


### query,key,value model

In [259]:
vocabulary = ['a', 'b', 'c', 'd']
documents = ['d1', 'd2', 'd3', 'd4', 'd5']

# Toy inverted index: each key is a tuple of terms and each value is the list
# of documents stored under that tuple.
inverted_index = {
    ('a', 'b'): ['d1', 'd2'],
    ('a', 'c'): ['d1', 'd2', 'd3'],
    # The trailing comma makes this a 1-tuple; ('b') is just the string 'b'.
    ('b',): ['d1', 'd2'],
    ('c', 'd'): ['d4', 'd5'],
}

query = ('a', 'c')

# Collect the documents of every key that shares at least one term with the query.
query_terms = set(query)
results = set()
for key, value in inverted_index.items():
    if query_terms & set(key):
        results.update(value)

    

### Projection of x into query, key, and value matrices

In [260]:


# Output widths of the three projections.
q_d = k_d = 20  # queries and keys share one width
v_d = 25        # width of the value projection

# One normally distributed weight matrix per projection, shaped
# (output width, EMBD_LEN). Drawn in the order wq, wk, wv.
wq, wk, wv = (
    np.random.normal(size=(out_dim, EMBD_LEN)) for out_dim in (q_d, k_d, v_d)
)

print(f"x, input shape {x.shape}")
print(f"wq, q weight matrix shape {wq.shape}")
print(f"wk, k weight matrix shape {wk.shape}")
print(f"wv, v weight matrix shape {wv.shape}")



x, input shape (5, 10)
wq, q weight matrix shape (20, 10)
wk, k weight matrix shape (20, 10)
wv, v weight matrix shape (25, 10)


In [261]:
# Project x with each weight matrix. The weights are applied to x.T, and the
# product is transposed back so that each projected row lines up with a row of x.
wqp = (wq @ x.T).T
wkp = (wk @ x.T).T
wvp = (wv @ x.T).T

print(f"wqp, q weight matrix shape {wqp.shape}")
print(f"wkp, k weight matrix shape {wkp.shape}")
print(f"wvp, v weight matrix shape {wvp.shape}")


wqp, q weight matrix shape (5, 20)
wkp, k weight matrix shape (5, 20)
wvp, v weight matrix shape (5, 25)


In [262]:
# score calculation
def softmax(x, axis=1):
    """Return the softmax of ``x`` along ``axis``.

    The maximum along ``axis`` is subtracted before exponentiating. This does
    not change the mathematical result, but it keeps ``np.exp`` from
    overflowing to ``inf`` (and the division from producing ``nan``) when the
    scores are large.

    Parameters
    ----------
    x : array_like
        Input scores.
    axis : int, optional
        Axis along which the entries are normalised to sum to 1. Defaults to
        1, i.e. each row of a 2-D array.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``x`` whose entries along ``axis`` are
        non-negative and sum to 1.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalise; also avoids taking the max of an empty axis.
        return np.exp(x)
    shifted = x - x.max(axis=axis, keepdims=True)
    e_x = np.exp(shifted)
    return e_x / e_x.sum(axis=axis, keepdims=True)

# Raw scores: dot product of every projected query row with every projected key row.
score = np.matmul(wqp, wkp.T)

print(f"score shape {score.shape}")
# Scale by the square root of the key width before the softmax. The width is
# read from wkp itself; the name `wkd` is not defined by any cell above, so
# dividing by it fails on a fresh kernel.
scaled_score = score / np.sqrt(wkp.shape[-1])
scaled_softmax_score = softmax(scaled_score)


score shape (5, 5)


In [263]:
# Weight the value projections by the softmax scores, then add the resulting
# rows together (axis 0) to get a single vector.
# NOTE(review): scaled dot-product attention normally keeps one output row per
# input row; confirm that collapsing them with a sum is intended here.
context_vector = (scaled_softmax_score @ wvp).sum(axis=0)
context_vector.shape

(25,)