In [1]:
from nltk import WordPunctTokenizer
import numpy as np

In [2]:
inputs = 'Привет мир'

In [3]:
tokenzier = WordPunctTokenizer()

In [4]:
tokenzie_seq = tokenzier.tokenize(inputs.lower())

In [5]:
tokenzie_seq

['привет', 'мир']

In [6]:
embeddings_dim = 4

In [7]:
input_embeddings = np.random.rand(len(tokenzie_seq), embeddings_dim)

In [8]:
input_embeddings

array([[0.61591788, 0.87259507, 0.02572226, 0.77880353],
       [0.02423783, 0.44503231, 0.84729367, 0.3490508 ]])

In [9]:
def positional_encoding(input_embeddings):
    seq_len, embeddings_dim = input_embeddings.shape

    position = np.arange(seq_len)[:, np.newaxis]
    
    div_term = np.exp(np.arange(0, embeddings_dim, 2) * - (np.log(10000.0) / embeddings_dim))
    
    PE = np.zeros((seq_len, embeddings_dim))
    PE[:, 0::2] = np.sin(position * div_term) 
    
    PE[:, 1::2] = np.cos(position * div_term) 
    
    return PE

In [10]:
input_embeddings += positional_encoding(input_embeddings)

In [11]:
input_embeddings

array([[0.61591788, 1.87259507, 0.02572226, 1.77880353],
       [0.86570881, 0.98533462, 0.8572935 , 1.3490008 ]])

In [12]:
def create_heads(heads_numbers, heads_n, heads_m):
    WK = []
    WQ = []
    WV = []

    for _ in range(heads_numbers):
        WK_temp = np.random.randint(0, 2, size=(heads_n, heads_m))
        WQ_temp = np.random.randint(0, 2, size=(heads_n, heads_m))
        WV_temp = np.random.randint(0, 2, size=(heads_n, heads_m))

        WK.append(WK_temp)
        WQ.append(WQ_temp)
        WV.append(WV_temp)

    return np.array(WK), np.array(WQ), np.array(WV)

In [13]:
WK, WQ, WV = create_heads(2, 4, 3)

In [14]:
WK1, WK2 = WK[0], WK[1]
WQ1, WQ2 = WQ[0], WQ[1]
WV1, WV2 = WV[0], WV[1]

In [15]:
K1 = input_embeddings @ WK1
K1

array([[0.61591788, 1.89831733, 2.39472141],
       [0.86570881, 1.84262812, 2.21470961]])

In [16]:
V1 = input_embeddings @ WV1
V1

array([[1.87259507, 0.02572226, 2.42044367],
       [0.98533462, 0.8572935 , 3.07200312]])

In [17]:
Q1 = input_embeddings @ WQ1
Q1

array([[3.6513986 , 4.29303874, 4.29303874],
       [2.33433542, 4.05733774, 4.05733774]])

In [18]:
scores1 = Q1 @ K1.T
scores1

array([[20.6791433 , 20.57935604],
       [18.85606702, 18.48284428]])

In [19]:
scores1 = scores1 / np.sqrt(embeddings_dim)
scores1

array([[10.33957165, 10.28967802],
       [ 9.42803351,  9.24142214]])

In [20]:
def softmax(x, axis=1):
    return np.exp(x) / np.sum(np.exp(x), axis=axis, keepdims=True)

In [21]:
scores1 = softmax(scores1)
scores1

array([[0.51247082, 0.48752918],
       [0.54651793, 0.45348207]])

In [22]:
attention1 = scores1 @ V1
attention1

array([[1.44002971, 0.43113751, 2.73809791],
       [1.47023836, 0.40282491, 2.7159142 ]])

In [23]:
def attention(input_embeddings, WK, WQ, WV):
    K = input_embeddings @ WK
    Q = input_embeddings @ WQ
    V = input_embeddings @ WV
    
    scores = Q @ K.T
    scores = scores / np.sqrt(embeddings_dim)
    scores = softmax(scores)
    scores = scores @ V
    return scores

In [24]:
attention1 = attention(input_embeddings, WK1, WQ1, WV1)
attention1

array([[1.44002971, 0.43113751, 2.73809791],
       [1.47023836, 0.40282491, 2.7159142 ]])

In [25]:
attention2 = attention(input_embeddings, WK2, WQ2, WV2)
attention2

array([[1.47146546, 4.11216441, 1.19172126],
       [1.29878722, 4.14980252, 1.3334042 ]])

In [26]:
attentions = np.concatenate([attention1, attention2], axis=1)
attentions

array([[1.44002971, 0.43113751, 2.73809791, 1.47146546, 4.11216441,
        1.19172126],
       [1.47023836, 0.40282491, 2.7159142 , 1.29878722, 4.14980252,
        1.3334042 ]])

In [27]:
def multi_head_attention(input_embeddings, heads_numbers, heads_n, heads_m, W_n, W_m):
    WK, WQ, WV = create_heads(heads_numbers, heads_n, heads_m)

    attentions = []
    for i in range(len(WK)):
        WK_cur = WK[i]
        WQ_cur = WQ[i]
        WV_cur = WV[i]

        scores_cur = attention(input_embeddings, WK_cur, WQ_cur, WV_cur)
        attentions.append(scores_cur)

    W = np.random.rand(W_n, W_m)

    return np.concatenate(attentions, axis=1) @ W

In [28]:
Z = multi_head_attention(input_embeddings, 2, 4, 3, 6, 4)

In [29]:
Z

array([[5.26425771, 5.28549175, 3.01338395, 3.40271705],
       [5.32776873, 5.3597962 , 3.01425016, 3.43668358]])

In [30]:
def layer_norm(input_embeddings, epsilon=1e-6):
    mean = input_embeddings.mean(axis=-1, keepdims=True)
    std = input_embeddings.std(axis=-1, keepdims=True)
    
    return (input_embeddings - mean) / (std + epsilon)

In [31]:
layer_norm(input_embeddings)

array([[-0.58517879,  1.02276702, -1.34034693,  0.9027587 ],
       [-0.74406903, -0.14518267, -0.78619886,  1.67545056]])

In [32]:
def relu(x):
    return np.maximum(0, x)

In [33]:
def feed_forward(Z, W1, b1, W2, b2):
    return relu(Z.dot(W1) + b1).dot(W2) + b2

In [34]:
W1 = np.random.randn(4, 8)
W2 = np.random.randn(8, 4)
b1 = np.random.randn(8)
b2 = np.random.randn(4)

In [35]:
output_encoder = feed_forward(Z, W1, b1, W2, b2)
output_encoder

array([[-17.72807794, -18.82421232,  -6.29019925,  16.93018031],
       [-17.95186365, -19.00824948,  -6.25756062,  17.11355849]])

In [36]:
def encoder_layer(input_embeddings, heads_numbers, heads_n, heads_m, W_n, W_m):
    Z = multi_head_attention(input_embeddings, heads_numbers, heads_n, heads_m, W_n, W_m)

    W1 = np.random.randn(4, 8)
    W2 = np.random.randn(8, 4)
    b1 = np.random.randn(8)
    b2 = np.random.randn(4)

    output = feed_forward(Z, W1, b1, W2, b2)

    return layer_norm(output, Z)

In [37]:
output_encoder = encoder_layer(input_embeddings, 2, 4, 3, 6, 4)
output_encoder

array([[-0.46178134,  0.08327153,  1.20373282, -0.51715907],
       [-0.46077232,  0.08554531,  1.20272531, -0.51921085]])

In [38]:
def encoder(input_embeddings, n=6):
    for _ in range(n):
        input_embeddings = encoder_layer(input_embeddings, 2, 4, 3, 6, 4)

    return input_embeddings

In [39]:
output_encoder = encoder(input_embeddings)
output_encoder

array([[-0.83155033,  2.17455561, -3.01334648,  0.10501041],
       [-0.83155033,  2.17455561, -3.01334648,  0.10501041]])

In [40]:
sos_embedding = np.random.rand(len(tokenzie_seq), embeddings_dim) # start of the sequence

In [41]:
sos_embedding += positional_encoding(sos_embedding)
sos_embedding

array([[0.82912078, 1.92927165, 0.54918888, 1.92408717],
       [1.61825237, 1.09771491, 0.52366311, 1.66924314]])

In [42]:
decoder_self_attention = multi_head_attention(sos_embedding, 2, 4, 3, 6, 4)

In [43]:
decoder_self_attention

array([[9.66785665, 6.59645669, 7.7406441 , 6.94982241],
       [9.62822046, 6.55469488, 7.70815099, 6.91905955]])

In [44]:
decoder_self_attention = layer_norm(decoder_self_attention + sos_embedding)
decoder_self_attention

array([[ 1.68107402, -0.60374038, -0.87715946, -0.20017418],
       [ 1.68044953, -0.92652662, -0.50625207, -0.24767084]])

In [45]:
def encoder_decoder_attention(encoder_output, attention_input, WQ, WK, WV):
    K = encoder_output @ WK    
    V = encoder_output @ WV    
    Q = attention_input @ WQ   

    scores = Q @ K.T
    scores = scores / np.sqrt(embeddings_dim)
    scores = softmax(scores)
    scores = scores @ V
    return scores

In [46]:
def multi_head_encoder_decoder_attention(encoder_output, attention_input, heads_numbers, heads_n, heads_m, W_n, W_m):
    WK, WQ, WV = create_heads(heads_numbers, heads_n, heads_m)

    attentions = []
    for i in range(len(WK)):
        WK_cur = WK[i]
        WQ_cur = WQ[i]
        WV_cur = WV[i]
        
        scores_cur = encoder_decoder_attention(encoder_output, attention_input, WK_cur, WQ_cur, WV_cur)
        attentions.append(scores_cur)
    
    W = np.random.rand(W_n, W_m)
    concatenated_attention = np.concatenate(attentions, axis=1) @ W

    return concatenated_attention


In [47]:
Z_encoder_decoder = multi_head_encoder_decoder_attention(output_encoder, decoder_self_attention, 2, 4, 3, 6, 4)
Z_encoder_decoder

array([[-1.7473543 , -0.74219627,  0.01939987, -1.82984868],
       [-1.7473543 , -0.74219627,  0.01939987, -1.82984868]])

In [48]:
def decoder_layer(decoder_inputs, encoder_outputs, heads_numbers, heads_n, heads_m, W_n, W_m):
    Z = multi_head_attention(decoder_inputs, heads_numbers, heads_n, heads_m, W_n, W_m)

    W1 = np.random.randn(4, 8)
    W2 = np.random.randn(8, 4)
    b1 = np.random.randn(8)
    b2 = np.random.randn(4)

    output = feed_forward(Z, W1, b1, W2, b2)

    output = layer_norm(output, Z)

    Z_encoder_decoder = multi_head_encoder_decoder_attention(encoder_outputs, decoder_self_attention, 2, 4, 3, 6, 4)

    Z_encoder_decoder = layer_norm(Z_encoder_decoder + Z)

    output = feed_forward(Z_encoder_decoder, W1, b1, W2, b2)
    return layer_norm(output + Z_encoder_decoder)

In [49]:
output_decoder = decoder_layer(sos_embedding, output_encoder, 2, 4, 3, 6, 4)
output_decoder

array([[-1.19131474,  1.52620143,  0.14510851, -0.4799952 ],
       [-1.19027205,  1.52875507,  0.13804366, -0.47652668]])

In [50]:
def decoder(decoder_input, output_encoder, n=6):
    for _ in range(n):
        decoder_input = decoder_layer(decoder_input, output_encoder, 2, 4, 3, 6, 4)

    return decoder_input

In [51]:
decoder_outputs = decoder(sos_embedding, output_decoder)
decoder_outputs

array([[-0.07797279, -1.45344327,  1.36102441,  0.17039166],
       [-0.07797206, -1.45344378,  1.36102388,  0.17039196]])

In [52]:
def linear(x, W, b):
    return np.dot(x, W) + b

In [53]:
def output_probabilities(decoder_outputs, embeddings_dim, vocabulary_length):
    for decoder_output in decoder_outputs:
        linear_output = linear(decoder_output, np.random.randn(embeddings_dim, vocabulary_length), np.random.randn(vocabulary_length))

        softmax_output = softmax(linear_output, axis=0)

        print(softmax_output)

In [54]:
output_probabilities(decoder_outputs, embeddings_dim, 4)

[0.04761597 0.34187069 0.37068454 0.23982879]
[0.87256586 0.0350587  0.02668308 0.06569236]
