#### Wednesday, March 20, 2024

This will be my example of the lessons learned in Chapter 2.

In [1]:
import numpy as np
from scipy.special import softmax

### Step 1: Represent the Input

This represents the token embeddings for every token in the sentence. 

In [2]:
print("Step 1: Input : 3 inputs, d_model=4")
# One row per input token, one column per embedding dimension (d_model=4).
input_rows = [
    [1.0, 0.0, 1.0, 0.0],  # Input 1
    [0.0, 2.0, 0.0, 2.0],  # Input 2
    [1.0, 1.0, 1.0, 1.0],  # Input 3
]
x = np.array(input_rows)
print(x.shape)

Step 1: Input : 3 inputs, d_model=4
(3, 4)


### Step 2: Initializing the weight matrices

Randomly initialized weight matrices (fixed by hand in this example so the numbers are easy to follow), where the dimensions are driven by the number of inputs and the embedding size.

In [3]:
print("Step 2: weights 3 dimensions x d_model=4")
print("w_query")
# Query projection: 4 rows (one per embedding dimension) by 3 columns.
query_weight_rows = [
    [1, 0, 1],
    [1, 0, 0],
    [0, 0, 1],
    [0, 1, 1],
]
w_query = np.array(query_weight_rows)
print(w_query.shape)

Step 2: weights 3 dimensions x d_model=4
w_query
(4, 3)


In [4]:
print("w_key")
# Key projection: 4 rows (one per embedding dimension) by 3 columns.
key_weight_rows = [
    [0, 0, 1],
    [1, 1, 0],
    [0, 1, 0],
    [1, 1, 0],
]
w_key = np.array(key_weight_rows)
print(w_key.shape)

w_key
(4, 3)


In [5]:
print("w_value")
# Value projection: 4 rows (one per embedding dimension) by 3 columns.
value_weight_rows = [
    [0, 2, 0],
    [0, 3, 0],
    [1, 0, 3],
    [1, 1, 0],
]
w_value = np.array(value_weight_rows)
print(w_value.shape)

w_value
(4, 3)


### Step 3: Matrix multiplication to obtain Q, K, and V

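Here we multiply a (3x4) matrix (tokens x embeddings) by a (4x3) weight matrix (the "random" numbers from Step 2) to produce a (3x3) matrix of (tokens x projection dimensions), once each for Q, K, and V. The (token x token) matrix only appears in Step 4, where Q is multiplied by the transpose of K.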

In [6]:
print("Step 3: Matrix multiplication to obtain Q,K,V")

print("Queries: x * w_query")
# Project every input row through the query weights: (3, 4) @ (4, 3) -> (3, 3).
Q = x @ w_query
print(Q)
print(Q.shape)

Step 3: Matrix multiplication to obtain Q,K,V
Queries: x * w_query
[[1. 0. 2.]
 [2. 2. 2.]
 [2. 1. 3.]]
(3, 3)


In [7]:
print("Step 3: Matrix multiplication to obtain Q,K,V")

print("Keys: x * w_key")
# Project every input row through the key weights: (3, 4) @ (4, 3) -> (3, 3).
K = x @ w_key
print(K)
print(K.shape)

Step 3: Matrix multiplication to obtain Q,K,V
Keys: x * w_key
[[0. 1. 1.]
 [4. 4. 0.]
 [2. 3. 1.]]
(3, 3)


In [8]:
print("Values: x * w_value")
# Project every input row through the value weights: (3, 4) @ (4, 3) -> (3, 3).
V = x @ w_value
print(V)
print(V.shape)

Values: x * w_value
[[1. 2. 3.]
 [2. 8. 0.]
 [2. 6. 3.]]
(3, 3)


### Step 4: Scaled attention scores

In [9]:
print("Step 4: Scaled Attention Scores")
# Divisor standing in for the square-root scaling term; it is fixed at 1 in
# this example, so the scores below are the plain Q.K^T dot products.
k_d = 1
# Entry [i, j] is the dot product of query i with key j.
attention_scores = np.matmul(Q, K.T) / k_d
print(attention_scores)
print(attention_scores.shape)

Step 4: Scaled Attention Scores
[[ 2.  4.  4.]
 [ 4. 16. 12.]
 [ 4. 12. 10.]]
(3, 3)


#### (Softmax side note)



Compute the softmax function.

The softmax function transforms each element of a collection by computing the exponential of each element divided by the sum of the exponentials of all the elements. That is, if x is a one-dimensional numpy array:

    softmax(x) = np.exp(x)/sum(np.exp(x))

In [35]:
# Show at most 5 digits after the decimal point when arrays are printed.
np.set_printoptions(precision=5)
# Small 3x4 demo array used by the softmax examples that follow.
someArray = np.array(
    [
        [1, 0.5, 0.2, 3],
        [1, -1, 7, 3],
        [2, 12, 13, 3],
    ]
)

Compute the softmax transformation over the entire array.

In [36]:
# axis=None normalises over every element at once, so the whole array sums to 1.
m = softmax(someArray, axis=None)
m

array([[4.48308990e-06, 2.71913148e-06, 2.01438214e-06, 3.31258028e-05],
       [4.48308990e-06, 6.06720242e-07, 1.80860755e-03, 3.31258028e-05],
       [1.21863018e-05, 2.68421160e-01, 7.29644362e-01, 3.31258028e-05]])

In [37]:
# Grand total of the softmax output over all elements.
np.sum(m)

1.0

Compute the softmax transformation along the first axis (i.e., the columns).

>>> m = softmax(x, axis=0)

In [39]:
m = softmax(
    someArray,
    axis=0,  # normalise down each column
)
m

array([[2.11941558e-01, 1.01299681e-05, 2.75393864e-06, 3.33333333e-01],
       [2.11941558e-01, 2.26030140e-06, 2.47261635e-03, 3.33333333e-01],
       [5.76116885e-01, 9.99987610e-01, 9.97524630e-01, 3.33333333e-01]])

In [40]:
# Column totals of the softmax output.
np.sum(m, axis=0)

array([1., 1., 1., 1.])

Compute the softmax transformation along the second axis (i.e., the rows).

In [41]:
m = softmax(
    someArray,
    axis=1,  # normalise across each row
)
m

array([[1.05877070e-01, 6.42176889e-02, 4.75736340e-02, 7.82331607e-01],
       [2.42746030e-03, 3.28521027e-04, 9.79307378e-01, 1.79366403e-02],
       [1.22093673e-05, 2.68929212e-01, 7.31025390e-01, 3.31885014e-05]])

In [42]:
# Row totals of the softmax output.
np.sum(m, axis=1)

array([1., 1., 1.])

### Step 5: Scaled softmax attention scores for each vector

Notice that here we overwrite the raw `attention_scores` from Step 4 with their row-wise softmax, so each row becomes a set of attention weights that sums to 1.

In [10]:
print("Step 5: Scaled softmax attention_scores for each vector")
# Rebuild the raw scores from Q, K and k_d rather than soft-maxing
# `attention_scores` in place. The in-place form is not re-runnable: a second
# execution of this cell would apply softmax to already-normalised weights.
raw_attention_scores = (Q @ K.transpose()) / k_d
# axis=1 normalises each row independently, i.e. one softmax per input.
attention_scores = softmax(raw_attention_scores, axis=1)
print(attention_scores[0])
print(attention_scores[1])
print(attention_scores[2])

Step 5: Scaled softmax attention_scores for each vector
[0.06337894 0.46831053 0.46831053]
[6.03366485e-06 9.82007865e-01 1.79861014e-02]
[2.95387223e-04 8.80536902e-01 1.19167711e-01]


In [13]:
# Print the total of each of the three score rows, in order.
for row_index in (0, 1, 2):
    print(attention_scores[row_index].sum())

1.0
0.9999999999999999
1.0


### Step 6: The final attention representations

In [14]:
print("Step 6: attention value obtained by score1/k_d * V")
print(V[0])
print(V[1])
print(V[2])

# Each term below scales one row of V by the matching entry of
# attention_scores[0], i.e. the scores belonging to input 1.
# (The former `attention1 = attention_scores[0].reshape(-1, 1)` line was a dead
# assignment, overwritten immediately, and has been removed.)
print("Attention 1")
attention1 = attention_scores[0][0] * V[0]
print(attention1)

print("Attention 2")
attention2 = attention_scores[0][1] * V[1]
print(attention2)

print("Attention 3")
attention3 = attention_scores[0][2] * V[2]
print(attention3)

Step 6: attention value obtained by score1/k_d * V
[1. 2. 3.]
[2. 8. 0.]
[2. 6. 3.]
Attention 1
[0.06337894 0.12675788 0.19013681]
Attention 2
[0.93662106 3.74648425 0.        ]
Attention 3
[0.93662106 2.80986319 1.40493159]


### Step 7: Summing up the results

In [16]:
print("Step 7: summed the results to create the first line of the output matrix")
# Add the three weighted value vectors element-wise, starting from attention1
# so the additions happen in the order attention1 + attention2 + attention3.
attention_input1 = sum((attention2, attention3), attention1)
print(attention_input1)

Step 7: summed the results to create the first line of the output matrix
[1.93662106 6.68310531 1.59506841]


### Step 8: Steps 1 to 7 for all the inputs

In [17]:
print("Step 8: Step 1 to 7 for inputs 1 to 3")
# We assume we have 3 results with learned weights (they were not trained in this example).
# We assume we are implementing the original Transformer paper: 3 results of 64 dimensions each.
# A locally seeded generator keeps this placeholder data identical on every
# Restart & Run All, without touching NumPy's global random state.
head1_rng = np.random.default_rng(seed=8)
attention_head1 = head1_rng.random((3, 64))  # uniform floats in [0, 1)
print(attention_head1)

Step 8: Step 1 to 7 for inputs 1 to 3
[[0.52353105 0.43351839 0.76328678 0.3449429  0.27203237 0.90431417
  0.69152242 0.09339724 0.47987107 0.44888206 0.1571273  0.2378133
  0.42837473 0.42222934 0.26950239 0.75302314 0.23953702 0.37005732
  0.73004836 0.16261291 0.45862454 0.0296148  0.24272412 0.27160843
  0.62116776 0.90029296 0.63565268 0.05190275 0.11249023 0.61936921
  0.36791575 0.17169599 0.93257257 0.69532045 0.87245015 0.88797099
  0.10272708 0.95414602 0.75713875 0.4566532  0.36617211 0.93953534
  0.38317323 0.73428086 0.98699469 0.38373678 0.69603399 0.67249243
  0.68768379 0.40616813 0.65463682 0.06796404 0.78165776 0.44594069
  0.13907781 0.92005932 0.63079154 0.90278733 0.24098882 0.40180616
  0.35490241 0.98548336 0.22918653 0.47448347]
 [0.70980247 0.26607188 0.94461129 0.39824171 0.24119117 0.94186068
  0.54883663 0.22678189 0.9166946  0.26838782 0.39178928 0.20079832
  0.24241162 0.50466058 0.13353808 0.50094631 0.56982031 0.99609219
  0.56090133 0.33937431 0.785118

### Step 9: The output of the heads of the attention sublayer

In [18]:
print("Step 9: We assume we have trained the 8 heads of the attention sub-layer")
# Stand-ins for the outputs of 8 attention heads: each is a (3, 64) array of
# uniform floats in [0, 1). One seeded generator replaces eight copy-pasted,
# unseeded draws, so the values are reproducible on every run. The eight names
# are kept because the concatenation step refers to them individually.
heads_rng = np.random.default_rng(seed=9)
z0h1, z1h2, z2h3, z3h4, z4h5, z5h6, z6h7, z7h8 = (
    heads_rng.random((3, 64)) for head_index in range(8)
)
print("shape of one head",z0h1.shape,"dimension of 8 heads",64*8)

Step 9: We assume we have trained the 8 heads of the attention sub-layer
shape of one head (3, 64) dimension of 8 heads 512


### Step 10: Concatenation of the output of the heads

In [19]:
print("Step 10: Concatenation of heads 1 to 8 to obtain the original 8x64=512 output dimension of the model")
# Join the eight head outputs side by side along the column axis.
head_outputs = [z0h1, z1h2, z2h3, z3h4, z4h5, z5h6, z6h7, z7h8]
output_attention = np.concatenate(head_outputs, axis=1)
print(output_attention)

Step 10: Concatenation of heads 1 to 8 to obtain the original 8x64=512 output dimension of the model
[[0.01631874 0.19172767 0.75296192 ... 0.30210663 0.51791097 0.26156163]
 [0.46704406 0.28731821 0.74328609 ... 0.29452945 0.08013049 0.55818484]
 [0.5968385  0.29230314 0.55923477 ... 0.06511621 0.72178965 0.90935397]]
