In [1]:
from nltk import WordPunctTokenizer
import numpy as np

In [2]:
inputs = 'Привет мир'

In [3]:
tokenzier = WordPunctTokenizer()

In [4]:
tokenzie_seq = tokenzier.tokenize(inputs.lower())

In [5]:
tokenzie_seq

['привет', 'мир']

In [6]:
embeddings_dim = 4

In [7]:
input_embeddings = np.random.rand(len(tokenzie_seq), embeddings_dim)

In [8]:
input_embeddings

array([[0.35440833, 0.4533334 , 0.40693244, 0.0910958 ],
       [0.60112991, 0.62932783, 0.05967745, 0.48236352]])

In [9]:
def positional_encoding(input_embeddings):
    seq_len, embeddings_dim = input_embeddings.shape

    position = np.arange(seq_len)[:, np.newaxis]
    
    div_term = np.exp(np.arange(0, embeddings_dim, 2) * - (np.log(10000.0) / embeddings_dim))
    
    PE = np.zeros((seq_len, embeddings_dim))
    PE[:, 0::2] = np.sin(position * div_term) 
    
    PE[:, 1::2] = np.cos(position * div_term) 
    
    return PE

In [10]:
input_embeddings += positional_encoding(input_embeddings)

In [11]:
input_embeddings

array([[0.35440833, 1.4533334 , 0.40693244, 1.0910958 ],
       [1.4426009 , 1.16963014, 0.06967728, 1.48231352]])

In [12]:
def create_heads(heads_numbers, heads_n, heads_m):
    WK = []
    WQ = []
    WV = []

    for _ in range(heads_numbers):
        WK_temp = np.random.randint(0, 2, size=(heads_n, heads_m))
        WQ_temp = np.random.randint(0, 2, size=(heads_n, heads_m))
        WV_temp = np.random.randint(0, 2, size=(heads_n, heads_m))

        WK.append(WK_temp)
        WQ.append(WQ_temp)
        WV.append(WV_temp)

    return np.array(WK), np.array(WQ), np.array(WV)

In [13]:
WK, WQ, WV = create_heads(2, 4, 3)

In [14]:
WK1, WK2 = WK[0], WK[1]
WQ1, WQ2 = WQ[0], WQ[1]
WV1, WV2 = WV[0], WV[1]

In [15]:
K1 = input_embeddings @ WK1
K1

array([[2.21467417, 2.89883753, 1.85243657],
       [2.68190832, 4.09454455, 2.9945917 ]])

In [16]:
V1 = input_embeddings @ WV1
V1

array([[0.        , 1.86026584, 2.89883753],
       [0.        , 1.23930742, 4.09454455]])

In [17]:
Q1 = input_embeddings @ WQ1
Q1

array([[3.30576997, 1.4533334 , 1.49802825],
       [4.16422184, 1.16963014, 1.5519908 ]])

In [18]:
scores1 = Q1 @ K1.T
scores1

array([[14.30918305, 19.30249328],
       [15.48792679, 20.60474268]])

In [19]:
scores1 = scores1 / np.sqrt(embeddings_dim)
scores1

array([[ 7.15459153,  9.65124664],
       [ 7.7439634 , 10.30237134]])

In [20]:
def softmax(x, axis=1):
    return np.exp(x) / np.sum(np.exp(x), axis=axis, keepdims=True)

In [21]:
scores1 = softmax(scores1)
scores1

array([[0.076093  , 0.923907  ],
       [0.07186366, 0.92813634]])

In [22]:
attention1 = scores1 @ V1
attention1

array([[0.        , 1.28655801, 4.00355962],
       [0.        , 1.28393176, 4.00861667]])

In [23]:
def attention(input_embeddings, WK, WQ, WV):
    K = input_embeddings @ WK
    Q = input_embeddings @ WQ
    V = input_embeddings @ WV
    
    scores = Q @ K.T
    scores = scores / np.sqrt(embeddings_dim)
    scores = softmax(scores)
    scores = scores @ V
    return scores

In [24]:
attention1 = attention(input_embeddings, WK1, WQ1, WV1)
attention1

array([[0.        , 1.28655801, 4.00355962],
       [0.        , 1.28393176, 4.00861667]])

In [25]:
attention2 = attention(input_embeddings, WK2, WQ2, WV2)
attention2

array([[1.207324  , 0.        , 2.75214514],
       [1.21508683, 0.        , 2.75843142]])

In [26]:
attentions = np.concatenate([attention1, attention2], axis=1)
attentions

array([[0.        , 1.28655801, 4.00355962, 1.207324  , 0.        ,
        2.75214514],
       [0.        , 1.28393176, 4.00861667, 1.21508683, 0.        ,
        2.75843142]])

In [27]:
def multi_head_attention(input_embeddings, heads_numbers, heads_n, heads_m, W_n, W_m):
    WK, WQ, WV = create_heads(heads_numbers, heads_n, heads_m)

    attentions = []
    for i in range(len(WK)):
        WK_cur = WK[i]
        WQ_cur = WQ[i]
        WV_cur = WV[i]

        scores_cur = attention(input_embeddings, WK_cur, WQ_cur, WV_cur)
        attentions.append(scores_cur)

    W = np.random.rand(W_n, W_m)

    return np.concatenate(attentions, axis=1) @ W

In [28]:
Z = multi_head_attention(input_embeddings, 2, 4, 3, 6, 4)

In [29]:
Z

array([[6.60579383, 6.39852941, 3.40582287, 5.06714706],
       [6.61622183, 6.42713585, 3.44224757, 5.08628001]])

In [30]:
def layer_norm(input_embeddings, epsilon=1e-6):
    mean = input_embeddings.mean(axis=-1, keepdims=True)
    std = input_embeddings.std(axis=-1, keepdims=True)
    
    return (input_embeddings - mean) / (std + epsilon)

In [31]:
layer_norm(input_embeddings)

array([[-1.01692645,  1.35054195, -0.90377116,  0.57015566],
       [ 0.7000467 ,  0.22415465, -1.69348228,  0.76928093]])

In [32]:
def relu(x):
    return np.maximum(0, x)

In [33]:
def feed_forward(Z, W1, b1, W2, b2):
    return relu(Z.dot(W1) + b1).dot(W2) + b2

In [34]:
W1 = np.random.randn(4, 8)
W2 = np.random.randn(8, 4)
b1 = np.random.randn(8)
b2 = np.random.randn(4)

In [35]:
output_encoder = feed_forward(Z, W1, b1, W2, b2)
output_encoder

array([[ 17.56046869,  33.40746145, -11.15778284, -32.08958889],
       [ 17.62437233,  33.53126755, -11.16325082, -32.1339551 ]])

In [36]:
def encoder_layer(input_embeddings, heads_numbers, heads_n, heads_m, W_n, W_m):
    Z = multi_head_attention(input_embeddings, heads_numbers, heads_n, heads_m, W_n, W_m)

    W1 = np.random.randn(4, 8)
    W2 = np.random.randn(8, 4)
    b1 = np.random.randn(8)
    b2 = np.random.randn(4)

    output = feed_forward(Z, W1, b1, W2, b2)

    return layer_norm(output, Z)

In [37]:
output_encoder = encoder_layer(input_embeddings, 2, 4, 3, 6, 4)
output_encoder

array([[ 0.0798119 , -0.78269173, -0.26355583,  0.98491762],
       [ 0.07125516, -0.78104855, -0.25979459,  0.98593556]])

In [38]:
def encoder(input_embeddings, n=6):
    for _ in range(n):
        input_embeddings = encoder_layer(input_embeddings, 2, 4, 3, 6, 4)

    return input_embeddings

In [39]:
output_encoder = encoder(input_embeddings)
output_encoder

array([[ 1.80829931,  1.43183273, -2.20259763, -0.85643053],
       [ 1.80829931,  1.43183273, -2.20259763, -0.85643053]])

In [40]:
sos_embedding = np.random.rand(len(tokenzie_seq), embeddings_dim) # start of the sequence

In [41]:
sos_embedding += positional_encoding(sos_embedding)
sos_embedding

array([[0.13147021, 1.64945748, 0.3395816 , 1.49986676],
       [1.09633239, 0.91859958, 0.59780829, 1.75305504]])

In [42]:
decoder_self_attention = multi_head_attention(sos_embedding, 2, 4, 3, 6, 4)

In [43]:
decoder_self_attention

array([[6.68737952, 8.04150133, 4.97379977, 5.61006371],
       [6.75647136, 8.10109689, 5.00731091, 5.6778185 ]])

In [44]:
decoder_self_attention = layer_norm(decoder_self_attention + sos_embedding)
decoder_self_attention

array([[-0.2632541 ,  1.5611646 , -1.21955645, -0.07835404],
       [ 0.30604852,  1.25665889, -1.52503004, -0.03767737]])

In [45]:
def encoder_decoder_attention(encoder_output, attention_input, WQ, WK, WV):
    K = encoder_output @ WK    
    V = encoder_output @ WV    
    Q = attention_input @ WQ   

    scores = Q @ K.T
    scores = scores / np.sqrt(embeddings_dim)
    scores = softmax(scores)
    scores = scores @ V
    return scores

In [46]:
def multi_head_encoder_decoder_attention(encoder_output, attention_input, heads_numbers, heads_n, heads_m, W_n, W_m):
    WK, WQ, WV = create_heads(heads_numbers, heads_n, heads_m)

    attentions = []
    for i in range(len(WK)):
        WK_cur = WK[i]
        WQ_cur = WQ[i]
        WV_cur = WV[i]
        
        scores_cur = encoder_decoder_attention(encoder_output, attention_input, WK_cur, WQ_cur, WV_cur)
        attentions.append(scores_cur)
    
    W = np.random.rand(W_n, W_m)
    concatenated_attention = np.concatenate(attentions, axis=1) @ W

    return concatenated_attention


In [47]:
Z_encoder_decoder = multi_head_encoder_decoder_attention(output_encoder, decoder_self_attention, 2, 4, 3, 6, 4)
Z_encoder_decoder

array([[-4.02570883, -0.8047363 , -1.93070959, -3.09149961],
       [-4.02570883, -0.8047363 , -1.93070959, -3.09149961]])

In [48]:
def decoder_layer(decoder_inputs, encoder_outputs, heads_numbers, heads_n, heads_m, W_n, W_m):
    Z = multi_head_attention(decoder_inputs, heads_numbers, heads_n, heads_m, W_n, W_m)

    W1 = np.random.randn(4, 8)
    W2 = np.random.randn(8, 4)
    b1 = np.random.randn(8)
    b2 = np.random.randn(4)

    output = feed_forward(Z, W1, b1, W2, b2)

    output = layer_norm(output, Z)

    Z_encoder_decoder = multi_head_encoder_decoder_attention(encoder_outputs, decoder_self_attention, 2, 4, 3, 6, 4)

    Z_encoder_decoder = layer_norm(Z_encoder_decoder + Z)

    output = feed_forward(Z_encoder_decoder, W1, b1, W2, b2)
    return layer_norm(output + Z_encoder_decoder)

In [49]:
output_decoder = decoder_layer(sos_embedding, output_encoder, 2, 4, 3, 6, 4)
output_decoder

array([[ 1.00811647, -0.09434004, -1.58766331,  0.67388687],
       [ 1.00738228, -0.08646716, -1.59042641,  0.66951129]])

In [50]:
def decoder(decoder_input, output_encoder, n=6):
    for _ in range(n):
        decoder_input = decoder_layer(decoder_input, output_encoder, 2, 4, 3, 6, 4)

    return decoder_input

In [51]:
decoder_outputs = decoder(sos_embedding, output_decoder)
decoder_outputs

array([[-1.67847945,  0.35519744,  0.96247322,  0.3608088 ],
       [-1.67847968,  0.35519772,  0.96247245,  0.36080951]])

In [52]:
def linear(x, W, b):
    return np.dot(x, W) + b

In [56]:
def output_probabilities(decoder_outputs, embeddings_dim, vocabulary_length):
    for decoder_output in decoder_outputs:
        linear_output = linear(decoder_output, np.random.randn(embeddings_dim, vocabulary_length), np.random.randn(vocabulary_length))

        softmax_output = softmax(linear_output, axis=0)

        print(softmax_output)

In [57]:
output_probabilities(decoder_outputs, embeddings_dim, 4)

[0.11935165 0.21525809 0.66339475 0.00199552]
[0.19325242 0.62772799 0.00160382 0.17741577]
