In [1]:
import torch 
import torch.nn as nn
import fasttext as ft
import math
import torch.nn.functional as F

d_model = 10
new_dim = 5


def getWordVectors(sentence):
    sentence = sentence.split(' ')
    vecs = torch.rand((len(sentence),10))
    return vecs

In [None]:
class feedforward_Encoder(nn.Module):
    def __init__(self):
        super(feedforward_Encoder,self).__init__()
        self.l1 = nn.Linear(10,40)
        self.l2 = nn.Linear(40,10)
        
    def forward(self,x):
        y = F.relu(self.l1(x))
        y = self.l2(y)
        return y
        
    

In [None]:
class Encoder():
    def __init__(self, vectorRepresentations):
        self.vectorRepresentations = vectorRepresentations
        self.d_model = d_model
        self.new_dim = new_dim
        self.positional_encodings = None
        self.first_sublayer_output = None
        self.keys = None
        self.values = None
        


    def PositionalEncoding(self,wordVecs):
        for pos in range(wordVecs.shape[0]):
            for i in range(wordVecs[pos].shape[0]):
                if i%2 == 0:
                    wordVecs[pos][i] = wordVecs[pos][i] + math.sin(pos/(10000**(2*i/self.d_model)))
                else:
                    wordVecs[pos][i] = wordVecs[pos][i] + math.cos(pos/(10000**(2*i/self.d_model))) 
                    
        self.positional_encodings = wordVecs
        return wordVecs


    def get_qkv_weights(self,r,c):
        query_weights = torch.rand((r,c))
        key_weights = torch.rand((r,c))
        value_weights = torch.rand((r,c))
        self.keys = key_weights
        self.values = value_weights
        
        return query_weights, key_weights, value_weights
    
    
    def get_keys_and_values(self):
        return self.keys, self.values
    
    
    
    def qkvs(self,vectorMatrix, new_dim):
        query_weights, key_weights, value_weights = self.get_qkv_weights(self.d_model,new_dim)
        return torch.matmul(vectorMatrix, query_weights), torch.matmul(vectorMatrix, key_weights), \
        torch.matmul(vectorMatrix, value_weights) 
        # Check for transposeness in matrix multiplication
    
    
    def qk_dotproducts(self,queries, keys):
        dotproduct_matrix = torch.Tensor([])
        for i in queries:
            dotproduct_vector = torch.Tensor([])
            for j in keys:
                dotproduct_vector = torch.cat([dotproduct_vector, torch.dot(i,j).reshape(-1)])
            dotproduct_matrix = torch.cat([dotproduct_matrix, dotproduct_vector.reshape(1,-1)])
        return dotproduct_matrix
    
    
    def getSoftmaxed_qkdp(self,qk_dotproductmatrix):
        sm = nn.Softmax(dim = 0)
        sm_matrix = torch.tensor([])
        for i in qk_dotproductmatrix:
            sm_matrix = torch.cat([sm_matrix, sm(i).reshape(1,-1)])
        return sm_matrix
    
    
    def getSoftmaxWeightedValues(self,softmaxed_qkdp, values):
        dim2_mat = torch.tensor([])
        dim3_mat = torch.tensor([])
        outer_loop_range = softmaxed_qkdp.shape[0]
        inner_loop_range = values.shape[0]
        for i in range(outer_loop_range):
            for j in range(inner_loop_range):
                dim2_mat = torch.cat([dim2_mat, (softmaxed_qkdp[i][j]*values[j]).reshape(-1)])
            dim3_mat = torch.cat([dim3_mat, dim2_mat.reshape(1,values.shape[0],values.shape[1])])
            dim2_mat = torch.tensor([]) 
        return dim3_mat
    
    
    
    def getWeightedSum(self,softmax_weighted_values):
        next_layer_input = torch.tensor([])
        for i in softmax_weighted_values:
            transposed_i = i.t()
            new_word_representation = torch.tensor([])
            for j in transposed_i:
                rowsum = j.sum()
                new_word_representation = torch.cat([new_word_representation, rowsum.reshape(-1)])
            next_layer_input = \
            torch.cat([next_layer_input, new_word_representation.reshape(1,new_word_representation.shape[0])])    
        return next_layer_input
        
    
    
    def returnRepresentation(self):
        pos_encoded = self.PositionalEncoding(self.vectorRepresentations)
        new_dim = self.new_dim
        queries, keys, values = self.qkvs(pos_encoded, new_dim)
        qk_dotproductmatrix = self.qk_dotproducts(queries, keys)
        d_k = keys.shape[1] # to be changed later to square root of 'key' vector dimension
        qk_dotproductmatrix/=d_k
        softmaxed_qkdp = self.getSoftmaxed_qkdp(qk_dotproductmatrix)
        softmax_weighted_values = self.getSoftmaxWeightedValues(softmaxed_qkdp, values)
        weightedSum = self.getWeightedSum(softmax_weighted_values)
        return weightedSum  
    
    
    def getW0(self):
        self.t = torch.randn(self.d_model, self.d_model).float()
        return self.t
    
    
    
    def multiHeadAttention(self, wordVecs, heads=2):
        listOfHeads = []
        op = torch.tensor([])
        for i in range(heads):
            temp = self.returnRepresentation()
            listOfHeads.append(temp)
    
        outputRepresentation = torch.tensor([])
        for i in range(listOfHeads[0].shape[0]):
            outputRepresentation = torch.cat([listOfHeads[0][i],listOfHeads[1][i]])
            op = torch.cat([op, outputRepresentation.reshape(1,outputRepresentation.shape[0])])
        
        W0 = self.getW0()
        projected_attention_vecs = torch.matmul(op, W0) 
        #Layer Normalisation
        layer_norm_one = nn.LayerNorm(projected_attention_vecs.size()[1])
        add_and_norm = layer_norm_one(projected_attention_vecs+self.positional_encodings)
        ##############   
        self.first_sublayer_output = add_and_norm
        return add_and_norm
    
    
    def ff_and_addnorm(self):
        received_representations = self.multiHeadAttention(self.vectorRepresentations)
        ffobj = feedforward_Encoder()
        activations = torch.tensor([])
        for i in received_representations:
            activations = torch.cat([activations, ffobj(i).reshape(1,received_representations.shape[1])])
         
        layer_norm_two = nn.LayerNorm(activations.size()[1])
        add_and_norm = layer_norm_two(activations + self.first_sublayer_output)
        return add_and_norm
        
         
    def forward(self):
        return self.ff_and_addnorm()


In [None]:
english_wordVecs = getWordVectors('Hi there this is nuts')
# wordVecs

In [None]:
for i in range(2):
    encoding_layer = Encoder(english_wordVecs)
    english_wordVecs = encoding_layer.forward()

    
encoder_keys, encoder_values = a.get_keys_and_values()

# Decoder

In [2]:
class Decoder():
    
    def __init__(self, vectorRepresentations):
        self.vectorRepresentations = vectorRepresentations
        self.positional_encodings = None
        self.d_model = d_model
        self.new_dim = new_dim
        
    def PositionalEncoding(self,wordVecs):
        for pos in range(wordVecs.shape[0]):
            for i in range(wordVecs[pos].shape[0]):
                if i%2 == 0:
                    wordVecs[pos][i] = wordVecs[pos][i] + math.sin(pos/(10000**(2*i/self.d_model)))
                else:
                    wordVecs[pos][i] = wordVecs[pos][i] + math.cos(pos/(10000**(2*i/self.d_model))) 
                    
        self.positional_encodings = wordVecs
        return wordVecs
    
    
    def get_qkv_weights(self,r,c):
        query_weights = torch.rand((r,c))
        key_weights = torch.rand((r,c))
        value_weights = torch.rand((r,c))
        self.keys = key_weights
        self.values = value_weights
        return query_weights, key_weights, value_weights
    
    
    def qkvs(self,vectorMatrix, new_dim):
        query_weights, key_weights, value_weights = self.get_qkv_weights(self.d_model,new_dim)
        return torch.matmul(vectorMatrix, query_weights), torch.matmul(vectorMatrix, key_weights), \
        torch.matmul(vectorMatrix, value_weights) 
    
    
    def qk_dotproducts(self,queries, keys):
        dotproduct_matrix = torch.Tensor([])
        for i in queries:
            dotproduct_vector = torch.Tensor([])
            for j in keys:
                dotproduct_vector = torch.cat([dotproduct_vector, torch.dot(i,j).reshape(-1)])
            dotproduct_matrix = torch.cat([dotproduct_matrix, dotproduct_vector.reshape(1,-1)])
        return dotproduct_matrix
    
    
    def conditionedSoftmax(self, i):
        
        temp = torch.tensor([])
        softmax_tensor = torch.tensor([])
        sm = nn.Softmax(dim = 0)

        for element in i:
            if element != 0.0:
                temp = torch.cat([temp, element.reshape(-1)])
        softmax_tensor = sm(temp)
        
        for j in range(softmax_tensor.shape[0]):
            i[j] = softmax_tensor[j]
            
#         print(i)
    
        return i
    
    
    
    def getSoftmaxed_qkdp(self,qk_dotproductmatrix):
        sm_matrix = torch.tensor([])
        for i in qk_dotproductmatrix:
            sm_matrix = torch.cat([sm_matrix, self.conditionedSoftmax(i).reshape(1,-1)])
        return sm_matrix
    
     
    def getSoftmaxWeightedValues(self,softmaxed_qkdp, values):
        dim2_mat = torch.tensor([])
        dim3_mat = torch.tensor([])
        outer_loop_range = softmaxed_qkdp.shape[0]
        inner_loop_range = values.shape[0]
        for i in range(outer_loop_range):
            for j in range(inner_loop_range):
                dim2_mat = torch.cat([dim2_mat, (softmaxed_qkdp[i][j]*values[j]).reshape(-1)])
            dim3_mat = torch.cat([dim3_mat, dim2_mat.reshape(1,values.shape[0],values.shape[1])])
            dim2_mat = torch.tensor([]) 
        return dim3_mat
    
    
    
    def getWeightedSum(self,softmax_weighted_values):
        next_layer_input = torch.tensor([])
        for i in softmax_weighted_values:
            transposed_i = i.t()
            new_word_representation = torch.tensor([])
            for j in transposed_i:
                rowsum = j.sum()
                new_word_representation = torch.cat([new_word_representation, rowsum.reshape(-1)])
            next_layer_input = \
            torch.cat([next_layer_input, new_word_representation.reshape(1,new_word_representation.shape[0])])    
        return next_layer_input
    
    
    
    
    def returnRepresentation(self):
        pos_encoded = self.PositionalEncoding(self.vectorRepresentations)
        new_dim = self.new_dim
        queries, keys, values = self.qkvs(pos_encoded, new_dim)
        qk_dotproductmatrix = self.qk_dotproducts(queries, keys)
        
        
        #Creating mask matrix
        maskmatrix = [[0.0 for i in range(qk_dotproductmatrix.shape[1])]for j\
                      in range(qk_dotproductmatrix.shape[0])]
        for i in range(len(maskmatrix)):
            for j in range(i+1):
                maskmatrix[i][j] = 1
        maskmatrix = torch.tensor(maskmatrix).float()
        #######################
        
        
        d_k = keys.shape[1] # to be changed later to square root of 'key' vector dimension
        qk_dotproductmatrix/=d_k
        qk_dotproductmatrix = qk_dotproductmatrix * maskmatrix
#         return qk_dotproductmatrix
        softmaxed_qkdp = self.getSoftmaxed_qkdp(qk_dotproductmatrix)
#         return softmaxed_qkdp    
        softmax_weighted_values = self.getSoftmaxWeightedValues(softmaxed_qkdp, values)
        print(softmax_weighted_values)
        weightedSum = self.getWeightedSum(softmax_weighted_values)
        return weightedSum 
        
     
    def getW0(self):
        self.t = torch.randn(self.d_model, self.d_model).float()
        return self.t
    
    
    def maskedMultiHeadAttention_add_norm(self, wordVecs, heads=2):
        listOfHeads = []
        op = torch.tensor([])
        for i in range(heads):
            temp = self.returnRepresentation()
            listOfHeads.append(temp)
    
        outputRepresentation = torch.tensor([])
        for i in range(listOfHeads[0].shape[0]):
            outputRepresentation = torch.cat([listOfHeads[0][i],listOfHeads[1][i]])
            op = torch.cat([op, outputRepresentation.reshape(1,outputRepresentation.shape[0])])
        
        W0 = self.getW0()
        projected_attention_vecs = torch.matmul(op, W0) 
        #Layer Normalisation
        layer_norm_one = nn.LayerNorm(projected_attention_vecs.size()[1])
        add_and_norm_one = layer_norm_one(projected_attention_vecs+self.positional_encodings)
        ##############   
        self.first_sublayer_output = add_and_norm_one
        return add_and_norm_one
        
        
        
    
    
    
    
    
        
        
    
    

In [3]:
french_wordVecs = getWordVectors('Dans la vallée de la mort')
# french_wordVecs

In [4]:
a = Decoder(french_wordVecs)
a.maskedMultiHeadAttention_add_norm(french_wordVecs)

tensor([[[4.0955e+00, 5.4431e+00, 6.1252e+00, 2.4585e+00, 4.6336e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00]],

        [[3.0736e+00, 4.0850e+00, 4.5969e+00, 1.8451e+00, 3.4775e+00],
         [1.0820e+00, 1.4249e+00, 1.5252e+00, 6.6912e-01, 1.0588e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
         [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00]],

        [[2.9889e-01, 3.9724e-01, 4.4702e-01, 1.7943e-01, 3.3817e-01],
         [8.5225e-02, 1.1224e-01, 1.2014e-01, 5.2704e-02, 8.3400e-02],
  

tensor([[-0.8837, -1.5631,  0.3750, -0.0596,  0.4754,  2.3426,  0.3189, -0.0892,
         -0.0175, -0.8988],
        [-0.8471, -1.4398,  0.2267, -0.2460,  0.5746,  2.3508,  0.3408, -0.1208,
          0.1962, -1.0353],
        [-0.7381, -1.5955,  0.4828, -0.2323,  0.4887,  2.2831,  0.3011, -0.1163,
          0.1488, -1.0222],
        [-0.7959, -1.5796,  0.3865, -0.1677,  0.3478,  2.4010,  0.2708, -0.1987,
          0.1551, -0.8192],
        [-0.8681, -1.5336,  0.4178, -0.1895,  0.3109,  2.3930,  0.3527, -0.1811,
          0.1271, -0.8292],
        [-0.8732, -1.5923,  0.4085, -0.1370,  0.3352,  2.3639,  0.3352, -0.2284,
          0.1761, -0.7879]], grad_fn=<NativeLayerNormBackward>)