In [55]:
from nltk import WordPunctTokenizer
import numpy as np

In [56]:
inputs = 'Привет мир'

In [57]:
tokenzier = WordPunctTokenizer()

In [58]:
tokenzie_seq = tokenzier.tokenize(inputs.lower())

In [59]:
tokenzie_seq

['привет', 'мир']

In [60]:
embeddings_dim = 4

In [61]:
input_embeddings = np.random.rand(len(tokenzie_seq), embeddings_dim)

In [62]:
input_embeddings

array([[0.11126297, 0.03067202, 0.64445921, 0.02823956],
       [0.41482301, 0.87789299, 0.48406892, 0.39927242]])

In [63]:
def positional_encoding(input_embeddings):
    seq_len, embeddings_dim = input_embeddings.shape

    position = np.arange(seq_len)[:, np.newaxis]
    
    div_term = np.exp(np.arange(0, embeddings_dim, 2) * - (np.log(10000.0) / embeddings_dim))
    
    PE = np.zeros((seq_len, embeddings_dim))
    PE[:, 0::2] = np.sin(position * div_term) 
    
    PE[:, 1::2] = np.cos(position * div_term) 
    
    return PE

In [64]:
input_embeddings += positional_encoding(input_embeddings)

In [65]:
input_embeddings

array([[0.11126297, 1.03067202, 0.64445921, 1.02823956],
       [1.256294  , 1.41819529, 0.49406875, 1.39922242]])

In [66]:
def create_heads(heads_numbers, heads_n, heads_m):
    WK = []
    WQ = []
    WV = []

    for _ in range(heads_numbers):
        WK_temp = np.random.randint(0, 2, size=(heads_n, heads_m))
        WQ_temp = np.random.randint(0, 2, size=(heads_n, heads_m))
        WV_temp = np.random.randint(0, 2, size=(heads_n, heads_m))

        WK.append(WK_temp)
        WQ.append(WQ_temp)
        WV.append(WV_temp)

    return np.array(WK), np.array(WQ), np.array(WV)

In [67]:
WK, WQ, WV = create_heads(2, 4, 3)

In [68]:
WK1, WK2 = WK[0], WK[1]
WQ1, WQ2 = WQ[0], WQ[1]
WV1, WV2 = WV[0], WV[1]

In [69]:
K1 = input_embeddings @ WK1
K1

array([[0.75572218, 0.11126297, 0.        ],
       [1.75036275, 1.256294  , 0.        ]])

In [70]:
V1 = input_embeddings @ WV1
V1

array([[0.11126297, 2.05891158, 1.78639421],
       [1.256294  , 2.81741771, 3.16855804]])

In [71]:
Q1 = input_embeddings @ WQ1
Q1

array([[1.78396175, 2.81463377, 1.67513123],
       [3.14958516, 4.56778046, 1.91226404]])

In [72]:
scores1 = Q1 @ K1.T
scores1

array([[ 1.66134399,  6.6585877 ],
       [ 2.88843622, 11.25139172]])

In [73]:
scores1 = scores1 / np.sqrt(embeddings_dim)
scores1

array([[0.830672  , 3.32929385],
       [1.44421811, 5.62569586]])

In [74]:
def softmax(x, axis=1):
    return np.exp(x) / np.sum(np.exp(x), axis=axis, keepdims=True)

In [75]:
scores1 = softmax(scores1)
scores1

array([[0.07595485, 0.92404515],
       [0.01504607, 0.98495393]])

In [76]:
attention1 = scores1 @ V1
attention1

array([[1.16932334, 2.75980549, 3.06357599],
       [1.23906578, 2.80600517, 3.1477619 ]])

In [77]:
def attention(input_embeddings, WK, WQ, WV):
    K = input_embeddings @ WK
    Q = input_embeddings @ WQ
    V = input_embeddings @ WV
    
    scores = Q @ K.T
    scores = scores / np.sqrt(embeddings_dim)
    scores = softmax(scores)
    scores = scores @ V
    return scores

In [78]:
attention1 = attention(input_embeddings, WK1, WQ1, WV1)
attention1

array([[1.16932334, 2.75980549, 3.06357599],
       [1.23906578, 2.80600517, 3.1477619 ]])

In [79]:
attention2 = attention(input_embeddings, WK2, WQ2, WV2)
attention2

array([[2.75841497, 1.37036437, 1.37036437],
       [2.81136864, 1.39626383, 1.39626383]])

In [80]:
attentions = np.concatenate([attention1, attention2], axis=1)
attentions

array([[1.16932334, 2.75980549, 3.06357599, 2.75841497, 1.37036437,
        1.37036437],
       [1.23906578, 2.80600517, 3.1477619 , 2.81136864, 1.39626383,
        1.39626383]])

In [81]:
def multi_head_attention(input_embeddings, heads_numbers, heads_n, heads_m, W_n, W_m):
    WK, WQ, WV = create_heads(heads_numbers, heads_n, heads_m)

    attentions = []
    for i in range(len(WK)):
        WK_cur = WK[i]
        WQ_cur = WQ[i]
        WV_cur = WV[i]

        scores_cur = attention(input_embeddings, WK_cur, WQ_cur, WV_cur)
        attentions.append(scores_cur)

    W = np.random.rand(W_n, W_m)

    return np.concatenate(attentions, axis=1) @ W

In [82]:
Z = multi_head_attention(input_embeddings, 2, 4, 3, 6, 4)

In [83]:
Z

array([[6.07265691, 5.48709887, 6.35985769, 6.60340461],
       [6.17772097, 5.59596642, 6.47265251, 6.7092671 ]])

In [84]:
def layer_norm(input_embeddings, epsilon=1e-6):
    mean = input_embeddings.mean(axis=-1, keepdims=True)
    std = input_embeddings.std(axis=-1, keepdims=True)
    
    return (input_embeddings - mean) / (std + epsilon)

In [85]:
layer_norm(input_embeddings)

array([[-1.57381377,  0.86877517, -0.15727427,  0.86231287],
       [ 0.30151163,  0.72840801, -1.70830055,  0.67838091]])

In [86]:
def relu(x):
    return np.maximum(0, x)

In [87]:
def feed_forward(Z, W1, b1, W2, b2):
    return relu(Z.dot(W1) + b1).dot(W2) + b2

In [88]:
W1 = np.random.randn(4, 8)
W2 = np.random.randn(8, 4)
b1 = np.random.randn(8)
b2 = np.random.randn(4)

In [89]:
output_encoder = feed_forward(Z, W1, b1, W2, b2)
output_encoder

array([[  6.78340553, -18.86771623, -43.27758122,   7.57145825],
       [  6.87007077, -19.21147192, -44.121359  ,   7.83127799]])

In [90]:
def encoder_layer(input_embeddings, heads_numbers, heads_n, heads_m, W_n, W_m):
    Z = multi_head_attention(input_embeddings, heads_numbers, heads_n, heads_m, W_n, W_m)

    W1 = np.random.randn(4, 8)
    W2 = np.random.randn(8, 4)
    b1 = np.random.randn(8)
    b2 = np.random.randn(4)

    output = feed_forward(Z, W1, b1, W2, b2)

    return layer_norm(output, Z)

In [91]:
output_encoder = encoder_layer(input_embeddings, 2, 4, 3, 6, 4)
output_encoder

array([[-0.82659112, -0.44783026,  0.23297816,  0.99540385],
       [-0.82727985, -0.44868242,  0.23212757,  0.99956175]])

In [92]:
def encoder(input_embeddings, n=6):
    for _ in range(n):
        input_embeddings = encoder_layer(input_embeddings, 2, 4, 3, 6, 4)

    return input_embeddings

In [93]:
output_encoder = encoder(input_embeddings)
output_encoder

array([[-0.21160828,  1.34875022, -1.51187604, -3.69495817],
       [-0.21160828,  1.34875022, -1.51187604, -3.69495817]])

In [94]:
sos_embedding = np.random.rand(len(tokenzie_seq), embeddings_dim) # start of the sequence

In [95]:
sos_embedding += positional_encoding(sos_embedding)
sos_embedding

array([[0.60728785, 1.70612029, 0.32796654, 1.41379413],
       [1.72342342, 1.0525022 , 0.03861733, 1.41376623]])

In [96]:
decoder_self_attention = multi_head_attention(sos_embedding, 2, 4, 3, 6, 4)

In [97]:
decoder_self_attention

array([[5.01838214, 5.56991948, 9.1156004 , 5.14239635],
       [4.97027017, 5.49251789, 8.99698968, 5.03849665]])

In [98]:
decoder_self_attention = layer_norm(decoder_self_attention + sos_embedding)
decoder_self_attention

array([[-1.13615104,  0.03598941,  1.57543004, -0.47526842],
       [-0.45439745, -0.59284717,  1.72647041, -0.67922579]])

In [99]:
def encoder_decoder_attention(encoder_output, attention_input, WQ, WK, WV):
    K = encoder_output @ WK    
    V = encoder_output @ WV    
    Q = attention_input @ WQ   

    scores = Q @ K.T
    scores = scores / np.sqrt(embeddings_dim)
    scores = softmax(scores)
    scores = scores @ V
    return scores

In [100]:
def multi_head_encoder_decoder_attention(encoder_output, attention_input, heads_numbers, heads_n, heads_m, W_n, W_m):
    WK, WQ, WV = create_heads(heads_numbers, heads_n, heads_m)

    attentions = []
    for i in range(len(WK)):
        WK_cur = WK[i]
        WQ_cur = WQ[i]
        WV_cur = WV[i]
        
        scores_cur = encoder_decoder_attention(encoder_output, attention_input, WK_cur, WQ_cur, WV_cur)
        attentions.append(scores_cur)
    
    W = np.random.rand(W_n, W_m)
    concatenated_attention = np.concatenate(attentions, axis=1) @ W

    return concatenated_attention


In [101]:
Z_encoder_decoder = multi_head_encoder_decoder_attention(output_encoder, decoder_self_attention, 2, 4, 3, 6, 4)
Z_encoder_decoder

array([[-13.18388117, -11.24248645, -12.14853935, -11.9222812 ],
       [-13.18388117, -11.24248645, -12.14853935, -11.9222812 ]])

In [102]:
def decoder_layer(decoder_inputs, encoder_outputs, heads_numbers, heads_n, heads_m, W_n, W_m):
    Z = multi_head_attention(decoder_inputs, heads_numbers, heads_n, heads_m, W_n, W_m)

    W1 = np.random.randn(4, 8)
    W2 = np.random.randn(8, 4)
    b1 = np.random.randn(8)
    b2 = np.random.randn(4)

    output = feed_forward(Z, W1, b1, W2, b2)

    output = layer_norm(output, Z)

    Z_encoder_decoder = multi_head_encoder_decoder_attention(encoder_outputs, decoder_self_attention, 2, 4, 3, 6, 4)

    Z_encoder_decoder = layer_norm(Z_encoder_decoder + Z)

    output = feed_forward(Z_encoder_decoder, W1, b1, W2, b2)
    return layer_norm(output + Z_encoder_decoder)

In [103]:
output_decoder = decoder_layer(sos_embedding, output_encoder, 2, 4, 3, 6, 4)
output_decoder

array([[ 0.06577883, -1.5820185 ,  1.17248329,  0.34375637],
       [ 0.0662409 , -1.5817665 ,  1.17322211,  0.34230349]])

In [104]:
def decoder(decoder_input, output_encoder, n=6):
    for _ in range(n):
        decoder_input = decoder_layer(decoder_input, output_encoder, 2, 4, 3, 6, 4)

    return decoder_input

In [105]:
decoder_outputs = decoder(sos_embedding, output_decoder)
decoder_outputs

array([[ 0.69800707,  1.22136603, -1.2588952 , -0.6604779 ],
       [ 0.69800706,  1.22136603, -1.25889523, -0.66047786]])

In [106]:
def linear(x, W, b):
    return np.dot(x, W) + b

In [107]:
def output_probabilities(decoder_outputs, embeddings_dim, vocabulary_length):
    for decoder_output in decoder_outputs:
        linear_output = linear(decoder_output, np.random.randn(embeddings_dim, vocabulary_length), np.random.randn(vocabulary_length))

        softmax_output = softmax(linear_output, axis=0)

        print(softmax_output)

In [108]:
output_probabilities(decoder_outputs, embeddings_dim, 4)

[0.08062325 0.31888781 0.59731943 0.00316952]
[0.00243138 0.97276776 0.00314775 0.02165311]


In [210]:
from typing import Any


class Embeddings():
    def __init__(self, embeddings_dim):
        self.embeddings_dim = embeddings_dim

        self.words_list_ru = ['<sos>', 'привет', 'мир', '<eos>', '<unk>']
        self.words_list_en = ['<sos>', 'hello', 'world', '<eos>', '<unk>']

        self.create_input_ids_dict()
        self.create_embeddings()


    def create_input_ids_dict(self):
        self.input_ids_dict = {}

        for i in range(len(self.words_list_ru)):
            self.input_ids_dict[i] = self.words_list_ru[i]


    def get_input_id(self, token):
        return list(self.input_ids_dict.keys())[list(self.input_ids_dict.values()).index(token)]
    
    
    def get_input_ids(self, tokens):
        input_ids = []
        for token in tokens:
            input_ids.append(self.get_input_id(token))

        return input_ids
    

    def create_embeddings(self):
        self.embeddings_dict = {}

        for i in range(len(self.words_list_ru)):
            self.embeddings_dict[i] = np.random.rand(self.embeddings_dim)


    def get_embeddings(self, input_ids):
        embeddings = []
        for input_id in input_ids:
            embeddings.append(self.embeddings_dict[input_id])

        return np.array(embeddings)
    
    def __call__(self, tokens):
        self.input_ids = self.get_input_ids(tokens)

        return self.get_embeddings(self.input_ids)

In [223]:
class SimpleTransformer():
    def __init__(self, 
                 embeddings_dim = 4,
                 
                 ):
        self.tokenzier = WordPunctTokenizer()

        self.embeddings_dim = embeddings_dim
        self.embeddings = Embeddings(self.embeddings_dim)


    def __call__(self, inputs):
        tokenzie_seq = self.tokenzier.tokenize(inputs.lower())
        self.input_embeddings = self.embeddings(tokenzie_seq)

    
    def positional_encoding(input_embeddings):
        seq_len, embeddings_dim = input_embeddings.shape

        position = np.arange(seq_len)[:, np.newaxis]
        
        div_term = np.exp(np.arange(0, embeddings_dim, 2) * - (np.log(10000.0) / embeddings_dim))
        
        PE = np.zeros((seq_len, embeddings_dim))
        PE[:, 0::2] = np.sin(position * div_term) 
        
        PE[:, 1::2] = np.cos(position * div_term) 
        
        return PE
    
    
    def create_heads(heads_numbers, heads_n, heads_m):
        WK = []
        WQ = []
        WV = []

        for _ in range(heads_numbers):
            WK_temp = np.random.randint(0, 2, size=(heads_n, heads_m))
            WQ_temp = np.random.randint(0, 2, size=(heads_n, heads_m))
            WV_temp = np.random.randint(0, 2, size=(heads_n, heads_m))

            WK.append(WK_temp)
            WQ.append(WQ_temp)
            WV.append(WV_temp)

        return np.array(WK), np.array(WQ), np.array(WV)
    

    def softmax(x, axis=1):
        return np.exp(x) / np.sum(np.exp(x), axis=axis, keepdims=True)


    def attention(input_embeddings, WK, WQ, WV):
        K = input_embeddings @ WK
        Q = input_embeddings @ WQ
        V = input_embeddings @ WV
        
        scores = Q @ K.T
        scores = scores / np.sqrt(embeddings_dim)
        scores = softmax(scores)
        scores = scores @ V
        return scores


    def multi_head_attention(input_embeddings, heads_numbers, heads_n, heads_m, W_n, W_m):
        WK, WQ, WV = create_heads(heads_numbers, heads_n, heads_m)

        attentions = []
        for i in range(len(WK)):
            WK_cur = WK[i]
            WQ_cur = WQ[i]
            WV_cur = WV[i]

            scores_cur = attention(input_embeddings, WK_cur, WQ_cur, WV_cur)
            attentions.append(scores_cur)

        W = np.random.rand(W_n, W_m)

        return np.concatenate(attentions, axis=1) @ W
    

    def layer_norm(input_embeddings, epsilon=1e-6):
        mean = input_embeddings.mean(axis=-1, keepdims=True)
        std = input_embeddings.std(axis=-1, keepdims=True)
        
        return (input_embeddings - mean) / (std + epsilon)


    def relu(x):
        return np.maximum(0, x)
    

    def encoder_layer(input_embeddings, heads_numbers, heads_n, heads_m, W_n, W_m):
        Z = multi_head_attention(input_embeddings, heads_numbers, heads_n, heads_m, W_n, W_m)

        W1 = np.random.randn(4, 8)
        W2 = np.random.randn(8, 4)
        b1 = np.random.randn(8)
        b2 = np.random.randn(4)

        output = feed_forward(Z, W1, b1, W2, b2)

        return layer_norm(output, Z)
    

    def encoder(input_embeddings, n=6):
        for _ in range(n):
            input_embeddings = encoder_layer(input_embeddings, 2, 4, 3, 6, 4)

        return input_embeddings
    

    def encoder_decoder_attention(encoder_output, attention_input, WQ, WK, WV):
        K = encoder_output @ WK    
        V = encoder_output @ WV    
        Q = attention_input @ WQ   

        scores = Q @ K.T
        scores = scores / np.sqrt(embeddings_dim)
        scores = softmax(scores)
        scores = scores @ V
        return scores
    

    def multi_head_encoder_decoder_attention(encoder_output, attention_input, heads_numbers, heads_n, heads_m, W_n, W_m):
        WK, WQ, WV = create_heads(heads_numbers, heads_n, heads_m)

        attentions = []
        for i in range(len(WK)):
            WK_cur = WK[i]
            WQ_cur = WQ[i]
            WV_cur = WV[i]
            
            scores_cur = encoder_decoder_attention(encoder_output, attention_input, WK_cur, WQ_cur, WV_cur)
            attentions.append(scores_cur)
        
        W = np.random.rand(W_n, W_m)
        concatenated_attention = np.concatenate(attentions, axis=1) @ W

        return concatenated_attention
    

    def decoder_layer(decoder_inputs, encoder_outputs, heads_numbers, heads_n, heads_m, W_n, W_m):
        Z = multi_head_attention(decoder_inputs, heads_numbers, heads_n, heads_m, W_n, W_m)

        W1 = np.random.randn(4, 8)
        W2 = np.random.randn(8, 4)
        b1 = np.random.randn(8)
        b2 = np.random.randn(4)

        output = feed_forward(Z, W1, b1, W2, b2)

        output = layer_norm(output, Z)

        Z_encoder_decoder = multi_head_encoder_decoder_attention(encoder_outputs, decoder_self_attention, 2, 4, 3, 6, 4)

        Z_encoder_decoder = layer_norm(Z_encoder_decoder + Z)

        output = feed_forward(Z_encoder_decoder, W1, b1, W2, b2)
        return layer_norm(output + Z_encoder_decoder)
    

    def decoder(decoder_input, output_encoder, n=6):
        for _ in range(n):
            decoder_input = decoder_layer(decoder_input, output_encoder, 2, 4, 3, 6, 4)

        return decoder_input


    def linear(x, W, b):
        return np.dot(x, W) + b


    def output_probabilities(decoder_outputs, embeddings_dim, vocabulary_length):
        for decoder_output in decoder_outputs:
            linear_output = linear(decoder_output, np.random.randn(embeddings_dim, vocabulary_length), np.random.randn(vocabulary_length))

            softmax_output = softmax(linear_output, axis=0)

            return softmax_output

In [220]:
transformer =  SimpleTransformer()

In [221]:
inputs = 'Привет мир'
transformer_outputs = transformer(inputs)

In [222]:
transformer_outputs