In [1]:
import numpy as np

def rmsnorm(input_tensor, weights, eps=1e-8):
    """Apply RMS normalization along the last axis, then scale by ``weights``.

    Every vector along the last axis is divided by its own root mean square,
    ``sqrt(mean(x**2) + eps)``, and the result is multiplied element-wise by
    ``weights``. For 1-D input this matches normalizing by the RMS of the
    whole array; for batched input each row is normalized independently.

    Parameters
    ----------
    input_tensor : array_like
        Values to normalize. Integer input is promoted to float before
        squaring so that small integer dtypes cannot overflow.
    weights : array_like
        Gain applied after normalization; must broadcast against
        ``input_tensor``.
    eps : float, optional
        Constant added to the mean square before the square root so that an
        all-zero vector yields zeros instead of NaN (division by zero).

    Returns
    -------
    numpy.ndarray
        The weighted, normalized values, with the shape produced by
        broadcasting ``weights`` against ``input_tensor``.
    """
    values = np.asarray(input_tensor)
    if not np.issubdtype(values.dtype, np.inexact):
        # Squaring in an integer dtype can silently wrap around.
        values = values.astype(float)

    # A 0-d input has no last axis; fall back to a full reduction there.
    axis = -1 if values.ndim else None
    mean_square = np.mean(np.square(values), axis=axis, keepdims=True)
    rms_value = np.sqrt(mean_square + eps)

    return np.asarray(weights) * (values / rms_value)

# Earlier example inputs (disabled):
# input_a = np.array([5, 8, 7, 9, -2, 1, 0, 3])
# weights_g = np.array([0.1, 0.2, 0.05, 0.15, 0.1, 0.35, 0, 0.05])
# input_a = np.array([1, 2, 3, 4, 5])
# weights_g = np.array([0.1, 0.2, 0.3, 0.4, 0.5])

# Same base vector as the first disabled example, with every entry shifted by 5
# (the scalar broadcasts across the array).
input_a = np.array([5, 8, 7, 9, -2, 1, 0, 3]) + 5
weights_g = np.array([0.1, 0.2, 0.05, 0.15, 0.1, 0.35, 0, 0.05])

output = rmsnorm(input_a, weights_g)
print(output)

[0.10376493 0.26978881 0.06225896 0.21790635 0.03112948 0.21790635
 0.         0.04150597]


In [2]:
import numpy as np

def attention(x, rng=None):
    """Single-head self-attention over ``x`` using random projection weights.

    Three ``(d, d)`` matrices with entries uniform in ``[0, 1)`` project ``x``
    to queries, keys and values. Scores ``Q @ K.T / sqrt(d)`` are turned into
    row-wise softmax weights, which are then used to average the values.

    Parameters
    ----------
    x : numpy.ndarray
        2-D array of shape ``(n, d)``.
    rng : numpy.random.Generator, optional
        Source of the random weights. When omitted, the weights come from
        NumPy's global random state via ``np.random.rand`` (so
        ``np.random.seed`` still controls them).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n, d)``.
    """
    _, d = x.shape

    if rng is None:
        Wq = np.random.rand(d, d)
        Wk = np.random.rand(d, d)
        Wv = np.random.rand(d, d)
    else:
        Wq = rng.random((d, d))
        Wk = rng.random((d, d))
        Wv = rng.random((d, d))

    # compute queries, keys, values
    Q = np.dot(x, Wq)
    K = np.dot(x, Wk)
    V = np.dot(x, Wv)

    # calculate attention scores
    scores = np.dot(Q, K.T) / np.sqrt(d)

    # Softmax with the row maximum subtracted first: the result is
    # mathematically unchanged, but np.exp can no longer overflow to inf
    # (which previously produced inf / inf = NaN for large scores).
    # `initial` keeps the reduction valid when x has zero rows.
    row_max = np.max(scores, axis=1, keepdims=True, initial=-np.inf)
    exp_scores = np.exp(scores - row_max)
    attention_weights = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

    # weighted sum of values
    attention_output = np.dot(attention_weights, V)
    return attention_output

# Example input: 10 rows of 4 features, sampled uniformly from [0, 1).
x = np.random.rand(10, 4)
# attention() draws its projection weights at call time and no seed is set in
# this cell, so the values below differ from run to run.
output = attention(x)
# Bare expression so the notebook displays the resulting array.
output

array([[1.26623136, 1.26991938, 0.80545367, 0.6829949 ],
       [1.24198908, 1.24302783, 0.79473757, 0.67081707],
       [1.26720979, 1.26977609, 0.80499216, 0.68366085],
       [1.2262123 , 1.2267871 , 0.78715785, 0.66283928],
       [1.27206669, 1.28105231, 0.8091645 , 0.68552159],
       [1.27234947, 1.27676819, 0.80763497, 0.68616184],
       [1.26686396, 1.26917116, 0.80513724, 0.68354414],
       [1.23808574, 1.23624494, 0.79200041, 0.66916825],
       [1.24328208, 1.24271685, 0.79397839, 0.67173777],
       [1.28108688, 1.28503553, 0.81151524, 0.69063169]])

In [3]:
import numpy as np

def scaled_dot_product_attention(Q, K, V):
    """Compute ``softmax(Q @ K.T / sqrt(d_k)) @ V`` for 2-D inputs.

    Parameters
    ----------
    Q : numpy.ndarray
        Queries; ``d_k`` is taken from ``Q.shape[1]``.
    K : numpy.ndarray
        Keys, with the same second dimension as ``Q``.
    V : numpy.ndarray
        Values, one row per row of ``K``.

    Returns
    -------
    numpy.ndarray
        One row per query: the softmax-weighted average of the rows of ``V``.
    """
    d_k = Q.shape[1]
    scores = np.dot(Q, K.T) / np.sqrt(d_k)

    # Subtract each row's maximum before exponentiating. The softmax value is
    # unchanged, but np.exp can no longer overflow to inf and yield NaN.
    # `initial` keeps the reduction valid when K has zero rows.
    row_max = np.max(scores, axis=-1, keepdims=True, initial=-np.inf)
    exp_scores = np.exp(scores - row_max)
    weights = exp_scores / np.sum(exp_scores, axis=-1, keepdims=True)

    return np.dot(weights, V)

def multi_head_attention(x, head_n=16):
    """Multi-head self-attention over ``x`` using random projection weights.

    ``x`` of shape ``(n, d)`` is projected to queries, keys and values with
    three ``(d, d)`` matrices drawn from ``np.random.rand``. Each projection is
    split into ``head_n`` slices of width ``d // head_n``, every slice is run
    through ``scaled_dot_product_attention``, and the per-head results are
    concatenated along the last axis and multiplied by a fourth random
    ``(d, d)`` matrix.

    Parameters
    ----------
    x : numpy.ndarray
        2-D array of shape ``(n, d)``.
    head_n : int, optional
        Number of heads; ``d`` must be divisible by it.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n, d)``.
    """
    seq_len, model_dim = x.shape
    assert model_dim % head_n == 0, "Dimension must be divisible by the number of heads"
    head_dim = model_dim // head_n

    # Random projections, drawn in query / key / value order.
    w_query = np.random.rand(model_dim, model_dim)
    w_key = np.random.rand(model_dim, model_dim)
    w_value = np.random.rand(model_dim, model_dim)

    # Project, then view the feature axis as (head, per-head feature).
    split_shape = (seq_len, head_n, head_dim)
    queries = np.dot(x, w_query).reshape(split_shape)
    keys = np.dot(x, w_key).reshape(split_shape)
    values = np.dot(x, w_value).reshape(split_shape)

    # Run attention head by head, then stitch the heads back together.
    per_head = []
    for head in range(head_n):
        per_head.append(
            scaled_dot_product_attention(
                queries[:, head, :], keys[:, head, :], values[:, head, :]
            )
        )
    merged = np.concatenate(per_head, axis=-1)

    # Final output projection (drawn after the three input projections).
    w_out = np.random.rand(model_dim, model_dim)
    return np.dot(merged, w_out)

# Example input: 10 rows of 32 features, sampled uniformly from [0, 1).
# 32 is divisible by the default head_n=16, giving 2 features per head.
x = np.random.rand(10, 32)
# The weights inside multi_head_attention() are random and no seed is set in
# this cell, so the printed values differ from run to run.
output = multi_head_attention(x)
print(output)

[[152.89310592 139.01540984 157.00616999 154.17458983 179.12071845
  152.46563544 129.33416633 182.86497666 145.52328195 138.73135416
  166.82758555 129.94774183 152.46991469 163.08813098 136.50878431
  150.89314796 163.39261797 151.28590963 150.33222748 165.26774751
  156.21940788 143.07852477 152.49322381 104.0193102  169.14100661
  133.17807781 181.03874156 182.5139274  156.55275881 175.23870587
  149.81521697 139.19813471]
 [152.84967079 138.98997584 156.95389389 154.13447594 179.07781391
  152.4115376  129.29671843 182.79998179 145.49224186 138.6870352
  166.7803389  129.92286407 152.42832218 163.05047294 136.47580306
  150.86375085 163.35437332 151.24672399 150.27972855 165.23473987
  156.14997015 143.02152614 152.43052441 103.9976183  169.07909279
  133.13594741 180.98906365 182.43985628 156.52009411 175.21660646
  149.77236075 139.13260876]
 [153.06968073 139.1587932  157.24710103 154.30725075 179.23214561
  152.75194558 129.48642369 183.11675559 145.66264885 138.99551423
  167