In [33]:
import torch 
import torch.nn as nn
import fasttext as ft
import math
import torch.nn.functional as F
import pickle
import torch.optim as optim
from gensim.models import Word2Vec, KeyedVectors
import nltk

d_model = 10
new_dim = 5


# def getWordVectors(sentence):
#     sentence = sentence.split(' ')
#     vecs = torch.rand((len(sentence),10))
#     return vecs

In [3]:
class feedforward_Encoder(nn.Module):
    def __init__(self):
        super(feedforward_Encoder,self).__init__()
        self.l1 = nn.Linear(10,40)
        self.l2 = nn.Linear(40,10)
        
    def forward(self,x):
        y = F.relu(self.l1(x))
        y = self.l2(y)
        return y
        
    

In [4]:
r = 10
c = 5
query_weights = abs(torch.rand((r,c)))
key_weights = abs(torch.rand((r,c)))
value_weights = abs(torch.rand((r,c)))

params = []
params = params + list(query_weights) + list(key_weights) + list(value_weights)

In [5]:
class Encoder():
    def __init__(self):
        self.d_model = d_model
        self.new_dim = new_dim
        self.positional_encodings = None
        self.first_sublayer_output = None
        self.keys = None
        self.values = None
        


    def PositionalEncoding(self,wordVecs):
        for pos in range(wordVecs.shape[0]):
            for i in range(wordVecs[pos].shape[0]):
                if i%2 == 0:
                    wordVecs[pos][i] = wordVecs[pos][i] + math.sin(pos/(10000**(2*i/self.d_model)))
                else:
                    wordVecs[pos][i] = wordVecs[pos][i] + math.cos(pos/(10000**(2*i/self.d_model))) 
                    
        self.positional_encodings = wordVecs
        return wordVecs
    
    
    
    def qkvs(self,vectorMatrix, new_dim):
        return torch.matmul(vectorMatrix, query_weights), torch.matmul(vectorMatrix, key_weights), \
        torch.matmul(vectorMatrix, value_weights) 
        # Check for transposeness in matrix multiplication
    
    
    def qk_dotproducts(self,queries, keys):
        dotproduct_matrix = torch.Tensor([])
        for i in queries:
            dotproduct_vector = torch.Tensor([])
            for j in keys:
                dotproduct_vector = torch.cat([dotproduct_vector, torch.dot(i,j).reshape(-1)])
            dotproduct_matrix = torch.cat([dotproduct_matrix, dotproduct_vector.reshape(1,-1)])
        return dotproduct_matrix
    
    
    def getSoftmaxed_qkdp(self,qk_dotproductmatrix):
        sm = nn.Softmax(dim = 0)
        sm_matrix = torch.tensor([])
        for i in qk_dotproductmatrix:
            sm_matrix = torch.cat([sm_matrix, sm(i).reshape(1,-1)])
        return sm_matrix
    
    
    def getSoftmaxWeightedValues(self,softmaxed_qkdp, values):
        dim2_mat = torch.tensor([])
        dim3_mat = torch.tensor([])
        outer_loop_range = softmaxed_qkdp.shape[0]
        inner_loop_range = values.shape[0]
        for i in range(outer_loop_range):
            for j in range(inner_loop_range):
                dim2_mat = torch.cat([dim2_mat, (softmaxed_qkdp[i][j]*values[j]).reshape(-1)])
            dim3_mat = torch.cat([dim3_mat, dim2_mat.reshape(1,values.shape[0],values.shape[1])])
            dim2_mat = torch.tensor([]) 
        return dim3_mat
    
    
    
    def getWeightedSum(self,softmax_weighted_values):
        next_layer_input = torch.tensor([])
        for i in softmax_weighted_values:
            transposed_i = i.t()
            new_word_representation = torch.tensor([])
            for j in transposed_i:
                rowsum = j.sum()
                new_word_representation = torch.cat([new_word_representation, rowsum.reshape(-1)])
            next_layer_input = \
            torch.cat([next_layer_input, new_word_representation.reshape(1,new_word_representation.shape[0])])    
        return next_layer_input
        
    
    
    def returnRepresentation(self, vectorRepresentations):
        pos_encoded = self.PositionalEncoding(vectorRepresentations)
        new_dim = self.new_dim
        queries, keys, values = self.qkvs(pos_encoded, new_dim)
        qk_dotproductmatrix = self.qk_dotproducts(queries, keys)
        d_k = keys.shape[1] # to be changed later to square root of 'key' vector dimension
        qk_dotproductmatrix/=d_k
        softmaxed_qkdp = self.getSoftmaxed_qkdp(qk_dotproductmatrix)
        softmax_weighted_values = self.getSoftmaxWeightedValues(softmaxed_qkdp, values)
        weightedSum = self.getWeightedSum(softmax_weighted_values)
        return weightedSum  
    
    
    def getW0(self):
        self.t = torch.randn(self.d_model, self.d_model).float()
        return self.t
    
    
    
    def multiHeadAttention(self, vectorRepresentations, heads=2):
        listOfHeads = []
        op = torch.tensor([])
        for i in range(heads):
            temp = self.returnRepresentation(vectorRepresentations)
            listOfHeads.append(temp)
    
        outputRepresentation = torch.tensor([])
        for i in range(listOfHeads[0].shape[0]):
            outputRepresentation = torch.cat([listOfHeads[0][i],listOfHeads[1][i]])
            op = torch.cat([op, outputRepresentation.reshape(1,outputRepresentation.shape[0])])
        
        W0 = self.getW0()
        projected_attention_vecs = torch.matmul(op, W0) 
        #Layer Normalisation
        layer_norm_one = nn.LayerNorm(projected_attention_vecs.size()[1])
        add_and_norm = layer_norm_one(projected_attention_vecs+self.positional_encodings)
        ##############   
        self.first_sublayer_output = add_and_norm
        return add_and_norm
    
    
    def ff_and_addnorm(self, vectorRepresentations):
        received_representations = self.multiHeadAttention(vectorRepresentations)
        activationlist = []
        activations = torch.tensor([])
        for i in received_representations:
            ffobj = feedforward_Encoder()
            activationlist.append(ffobj)
            activations = torch.cat([activations, activationlist[-1](i).reshape(1,received_representations.\
                                                                                shape[1])])
         
        layer_norm_two = nn.LayerNorm(activations.size()[1])
        add_and_norm = layer_norm_two(activations + self.first_sublayer_output)
        return add_and_norm
        
         
    def forward(self, vectorRepresentations):
        return self.ff_and_addnorm(vectorRepresentations)


# Decoder

In [35]:
r = 10
c = 5
decoder_query_weights = abs(torch.rand((r,c)))
decoder_key_weights = abs(torch.rand((r,c)))
decoder_value_weights = abs(torch.rand((r,c)))

decoder_masked_query_weights = abs(torch.rand((r,c)))
decoder_masked_key_weights = abs(torch.rand((r,c)))
decoder_masked_value_weights = abs(torch.rand((r,c)))



# params = []
# params = params + list(query_weights) + list(key_weights) + list(value_weights)

In [39]:
class Decoder():
    
    def __init__(self):
        self.vectorRepresentations = None
        self.positional_encodings = None
        self.d_model = d_model
        self.new_dim = new_dim
        self.maskedMultiHeadAttentionOutputVectors = None
        self.multiHeadAttentionOutputVectors = None
        
        
    def PositionalEncoding(self,wordVecs):
        for pos in range(wordVecs.shape[0]):
            for i in range(wordVecs[pos].shape[0]):
                if i%2 == 0:
                    wordVecs[pos][i] = wordVecs[pos][i] + math.sin(pos/(10000**(2*i/self.d_model)))
                else:
                    wordVecs[pos][i] = wordVecs[pos][i] + math.cos(pos/(10000**(2*i/self.d_model))) 
                    
        self.positional_encodings = wordVecs
        return wordVecs
    
    

    def qkvs_Attention(self,vectorMatrix):
        return torch.matmul(vectorMatrix, decoder_query_weights), torch.matmul(vectorMatrix, decoder_key_weights), \
        torch.matmul(vectorMatrix, decoder_value_weights) 
    
    
    
    def qkvs_maskedAttention(self,vectorMatrix):
        return torch.matmul(vectorMatrix, decoder_masked_query_weights), torch.matmul(vectorMatrix, decoder_masked_key_weights), \
        torch.matmul(vectorMatrix, decoder_masked_value_weights) 
    
    
    def maskedMatrix(self,m,ind):
        returnMatrix = torch.tensor([]).float()
        for i in range(m.shape[0]):
            if i<=ind:
                returnMatrix = torch.cat([returnMatrix,m[i].unsqueeze(0)])
            else:
                returnMatrix = torch.cat([returnMatrix,torch.tensor([-float('Inf') for k in range(m.shape[1])]).float().unsqueeze(0)])
        
        return returnMatrix
    
    
    def dotProductMaskedMatrix(self,l,m2):
        returnMatrix = torch.tensor([]).float()
        for i in range(m2.shape[0]):
            returnMatrix = torch.cat([returnMatrix,torch.dot(l,m2[i]).reshape(-1)])
       
        
        return returnMatrix
    
    
    def qk_dotproducts_maskedAttention(self,queries, keys):
        finalMatrix = torch.Tensor([])
        for i in range(queries.shape[0]):
            b = maskedMatrix(queries,i)
            c = dotProductMaskedMatrix(b[i],b)
            d = nn.Softmax(dim=0)(c)
        finalMatrix = torch.cat([finalMatrix,d.unsqueeze(0)])
    
        return finalMatrix
    
    
    def qk_dotproducts_Attention(self,queries, keys):
        dotproduct_matrix = torch.Tensor([])
        for i in queries:
            dotproduct_vector = torch.Tensor([])
            for j in keys:
                dotproduct_vector = torch.cat([dotproduct_vector, torch.dot(i,j).reshape(-1)])
            dotproduct_matrix = torch.cat([dotproduct_matrix, dotproduct_vector.reshape(1,-1)])
            
        return dotproduct_matrix
    
    
    def conditionedSoftmax(self, i):
        temp = torch.tensor([])
        softmax_tensor = torch.tensor([])
        sm = nn.Softmax(dim = 0)

        for element in i:
            if element != 0.0:
                temp = torch.cat([temp, element.reshape(-1)])
        softmax_tensor = sm(temp)
        for j in range(softmax_tensor.shape[0]):
            i[j] = softmax_tensor[j]

        return i
    
    
    
    def getSoftmaxed_qkdp(self,qk_dotproductmatrix):
        sm_matrix = torch.tensor([])
        for i in qk_dotproductmatrix:
            sm_matrix = torch.cat([sm_matrix, self.conditionedSoftmax(i).reshape(1,-1)])
        return sm_matrix
    
     
    def getSoftmaxWeightedValues(self,softmaxed_qkdp, values):
        dim2_mat = torch.tensor([])
        dim3_mat = torch.tensor([])
        outer_loop_range = softmaxed_qkdp.shape[0]
        inner_loop_range = values.shape[0]
        for i in range(outer_loop_range):
            for j in range(inner_loop_range):
                dim2_mat = torch.cat([dim2_mat, (softmaxed_qkdp[i][j]*values[j]).reshape(-1)])
            dim3_mat = torch.cat([dim3_mat, dim2_mat.reshape(1,values.shape[0],values.shape[1])])
            dim2_mat = torch.tensor([]) 
        return dim3_mat
    
    
    
    def getWeightedSum(self,softmax_weighted_values):
        next_layer_input = torch.tensor([])
        for i in softmax_weighted_values:
            transposed_i = i.t()
            new_word_representation = torch.tensor([])
            for j in transposed_i:
                rowsum = j.sum()
                new_word_representation = torch.cat([new_word_representation, rowsum.reshape(-1)])
            next_layer_input = \
            torch.cat([next_layer_input, new_word_representation.reshape(1,new_word_representation.shape[0])])    
        return next_layer_input
    
    
    
    
    def returnMaskedRepresentation(self, vectorRepresentations):
        pos_encoded = self.PositionalEncoding(vectorRepresentations)
        new_dim = self.new_dim
        queries, keys, values = self.qkvs_maskedAttention(pos_encoded)
        qk_dotproductmatrix = self.qk_dotproducts_maskedAttention(queries, keys)
        
        
        
        
        
        d_k = keys.shape[1] # to be changed later to square root of 'key' vector dimension
        qk_dotproductmatrix/=d_k   
        softmax_weighted_values = self.getSoftmaxWeightedValues(qk_dotproductmatrix, values)
        weightedSum = self.getWeightedSum(softmax_weighted_values)
        return weightedSum 
        
 





     
    def getW0(self):
        self.t = torch.randn(self.d_model, self.d_model).float()
        return self.t
    
    
    def maskedMultiHeadAttention_add_norm(self, vectorRepresentations, heads=2):
        listOfHeads = []
        op = torch.tensor([])
        
        #Multiple Heads
        for i in range(heads):
            temp = self.returnMaskedRepresentation()
            listOfHeads.append(temp)
    
        outputRepresentation = torch.tensor([])
        for i in range(listOfHeads[0].shape[0]):
            outputRepresentation = torch.cat([listOfHeads[0][i],listOfHeads[1][i]])
            op = torch.cat([op, outputRepresentation.reshape(1,outputRepresentation.shape[0])])
        
        W0 = self.getW0()
        projected_attention_vecs = torch.matmul(op, W0) 
        #Layer Normalisation
        layer_norm_one = nn.LayerNorm(projected_attention_vecs.size()[1])
        add_and_norm_one = layer_norm_one(projected_attention_vecs+self.positional_encodings)
        ##############   
        self.first_sublayer_output = add_and_norm_one
        return add_and_norm_one
        
     
    
    def returnRepresentation(self, vectorRepresentations):
        inp_vectors = self.maskedMultiHeadAttention_add_norm(vectorRepresentations)
        self.maskedMultiHeadAttentionOutputVectors = inp_vectors
        new_dim = self.new_dim
        queries, keys, values = self.qkvs(inp_vectors, new_dim)
        del keys
        del values
        keys = encoder_keys
        values = encoder_values
        qk_dotproductmatrix = self.qk_dotproducts(queries, keys)
        d_k = keys.shape[1] # to be changed later to square root of 'key' vector dimension
        qk_dotproductmatrix/=d_k #In paper we divide by sqrt(d_k) that is sqrt(64) = 8
        softmaxed_qkdp = self.getSoftmaxed_qkdp(qk_dotproductmatrix)
        softmax_weighted_values = self.getSoftmaxWeightedValues(softmaxed_qkdp, values)
        weightedSum = self.getWeightedSum(softmax_weighted_values)
        return weightedSum  
    
    
    
    def multiHeadAttention_add_norm(self, vectorRepresentations, heads=2):
        
        listOfHeads = []
        op = torch.tensor([])
        
        #Multiple Heads
        for i in range(heads):
            temp = self.returnRepresentation(vectorRepresentations)
            listOfHeads.append(temp)
            
        outputRepresentation = torch.tensor([])
        for i in range(listOfHeads[0].shape[0]):
            outputRepresentation = torch.cat([listOfHeads[0][i],listOfHeads[1][i]])
            op = torch.cat([op, outputRepresentation.reshape(1,outputRepresentation.shape[0])])
        
        W0 = self.getW0()
        projected_attention_vecs = torch.matmul(op, W0) 
        #Layer Normalisation
        layer_norm_two = nn.LayerNorm(projected_attention_vecs.size()[1])
        add_and_norm_two = layer_norm_two(projected_attention_vecs + self.maskedMultiHeadAttentionOutputVectors)
        ##############   
        self.first_sublayer_output = add_and_norm_two
        return add_and_norm_two
    
    
    
    def ff_and_addnorm(self, vectorRepresentations):
        received_representations = self.multiHeadAttention_add_norm(vectorRepresentations)
        self.multiHeadAttentionOutputVectors = received_representations
        activationlist = []
        
        activations = torch.tensor([])
        for i in received_representations:
            ffobj = feedforward_Encoder()
            activationlist.append(ffobj)
            activations = torch.cat([activations, activationlist[-1](i).reshape(1,received_representations.shape[1])])
         
        layer_norm_three = nn.LayerNorm(activations.size()[1])
        add_and_norm_three = layer_norm_three(activations + self.multiHeadAttentionOutputVectors)
        return add_and_norm_three
    
    
    def forward(self, vectorRepresentations):
        print(vectorRepresentations)
        return self.ff_and_addnorm(vectorRepresentations)
    

In [40]:
r1 = abs(torch.randn(5,10))
r2 = Decoder()
r2.forward(r1)

tensor([[0.8973, 0.0306, 0.3667, 0.3229, 0.5232, 0.6347, 0.0646, 0.5993, 0.0857,
         0.9511],
        [0.0503, 0.9271, 1.4507, 1.4921, 0.4589, 0.1918, 0.1472, 1.2900, 1.5106,
         1.3073],
        [1.0145, 1.1975, 0.7367, 0.3658, 0.2911, 2.0026, 0.8439, 0.0573, 0.0052,
         0.6477],
        [0.2373, 0.1808, 0.3104, 0.6364, 0.5896, 0.1469, 1.2421, 0.2624, 1.8989,
         1.4665],
        [0.8192, 0.5228, 2.1187, 0.2668, 0.5376, 0.1592, 0.8983, 0.0926, 0.0630,
         0.2670]])


In [12]:
vocab_size = 10
LinearLayer = nn.Sequential(
              nn.Linear(d_model, vocab_size),
              nn.ReLU(),
              nn.Softmax(dim=1)
)

In [None]:
# nltk.download('punkt')

germanSens = pickle.load(open(f'subsampledGermanSens.pkl', 'rb'))
germanSens = [i.lower() for i in germanSens]
germanVecs = [nltk.word_tokenize(sentence) for sentence in germanSens]


englishSens = pickle.load(open(f'subsampledEnglishSens.pkl', 'rb'))
englishSens = [i.lower() for i in englishSens]
englishVecs = [nltk.word_tokenize(sentence) for sentence in englishSens]



modelGerman = Word2Vec(germanVecs, min_count=1, size=10)
modelEnglish = Word2Vec(englishVecs, min_count=1, size=10)

In [None]:
def getWordVecs(listOfTokens,lang):
    wvecs = torch.tensor([]).float()
    
    for i in listOfTokens:
        if lang == 'en':
            wvecs = torch.cat([wvecs, torch.from_numpy(modelEnglish.wv[i]).unsqueeze(0)])
        elif lang == 'de':
            wvecs = torch.cat([wvecs, torch.from_numpy(modelGerman.wv[i]).unsqueeze(0)])

    return wvecs

In [None]:
encoding_layer = Encoder()
decoding_layer = Decoder()
# decoding_layer = Decoder()
encoder_keys, encoder_values = key_weights, value_weights
for i in range(100):
    english_wordVecs = encoding_layer.forward(getWordVecs(englishVecs[i],'en'))

    


In [13]:
decoder1 = Decoder()
# decoder2 = Decoder()

# word = french_wordVecs[0].reshape(1,-1)
for word in french_wordVecs:
    out = decoder1.forward(word.reshape(1,-1))
#     out = decoder2.forward(out)
    projected = LinearLayer(out)
    maxvalue, index = torch.max(projected,1)
    print(maxvalue[0].item(), '\t',index[0].item())

0.16061271727085114 	 5
0.1924039125442505 	 4
0.25766465067863464 	 7
0.14788836240768433 	 5
0.2986910045146942 	 7
0.15265703201293945 	 5


In [9]:
def maskedMatrix(m,ind):
    returnMatrix = torch.tensor([]).float()
    for i in range(m.shape[0]):
        if i<=ind:
            returnMatrix = torch.cat([returnMatrix,m[i].unsqueeze(0)])
        else:
            returnMatrix = torch.cat([returnMatrix,torch.tensor([-float('Inf') for k in range(m.shape[1])]).float().unsqueeze(0)])
        
    return returnMatrix

In [15]:
def dotProductMaskedMatrix(l,m2):
    returnMatrix = torch.tensor([]).float()
    for i in range(m2.shape[0]):
        returnMatrix = torch.cat([returnMatrix,torch.dot(l,m2[i]).reshape(-1)])
       
        
    return returnMatrix
    
    

In [17]:
import torch
import torch.nn as nn
a = abs(torch.randn(5,5))

In [18]:
finalMatrix = torch.Tensor([])
for i in range(a.shape[0]):
    b = maskedMatrix(a,i)
    c = dotProductMaskedMatrix(b[i],b)
    d = nn.Softmax(dim=0)(c)
    finalMatrix = torch.cat([finalMatrix,d.unsqueeze(0)])

In [19]:
finalMatrix

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0014, 0.9986, 0.0000, 0.0000, 0.0000],
        [0.1456, 0.3160, 0.5384, 0.0000, 0.0000],
        [0.1081, 0.2577, 0.2031, 0.4311, 0.0000],
        [0.1173, 0.3920, 0.1782, 0.2141, 0.0983]])

tensor([[1.4881, 1.0701, 1.7376, 1.0919, 1.4200]])

In [None]:
def returnMaskedRepresentation(self, vectorRepresentations):