In [2]:
import torch 
import torch.nn as nn
import fasttext as ft
import math

In [3]:
class SelfAttention():
    def __init__(self, vectorRepresentations):
        self.vectorRepresentations = vectorRepresentations
        self.d_model = 10
        self.new_dim = 5


    def PositionalEncoding(self,wordVecs):
        for pos in range(wordVecs.shape[0]):
            for i in range(wordVecs[pos].shape[0]):
                if i%2 == 0:
                    wordVecs[pos][i] = wordVecs[pos][i] + math.sin(pos/(10000**(2*i/self.d_model)))
                else:
                    wordVecs[pos][i] = wordVecs[pos][i] + math.cos(pos/(10000**(2*i/self.d_model)))            
        return wordVecs


    def get_qkv_weights(self,r,c):
        query_weights = torch.rand((r,c))
        key_weights = torch.rand((r,c))
        value_weights = torch.rand((r,c))
        return query_weights, key_weights, value_weights
    
    
    
    def qkvs(self,vectorMatrix, new_dim):
        query_weights, key_weights, value_weights = self.get_qkv_weights(self.d_model,new_dim)
        return torch.matmul(vectorMatrix, query_weights), torch.matmul(vectorMatrix, key_weights), \
        torch.matmul(vectorMatrix, value_weights) 
        # Check for transposeness in matrix multiplication
    
    
    def qk_dotproducts(self,queries, keys):
        dotproduct_matrix = torch.Tensor([])
        for i in queries:
            dotproduct_vector = torch.Tensor([])
            for j in keys:
                dotproduct_vector = torch.cat([dotproduct_vector, torch.dot(i,j).reshape(-1)])
            dotproduct_matrix = torch.cat([dotproduct_matrix, dotproduct_vector.reshape(1,-1)])
        return dotproduct_matrix
    
    
    def getSoftmaxed_qkdp(self,qk_dotproductmatrix):
        sm = nn.Softmax(dim = 0)
        sm_matrix = torch.tensor([])
        for i in qk_dotproductmatrix:
            sm_matrix = torch.cat([sm_matrix, sm(i).reshape(1,-1)])
        return sm_matrix
    
    
    def getSoftmaxWeightedValues(self,softmaxed_qkdp, values):
        dim2_mat = torch.tensor([])
        dim3_mat = torch.tensor([])
        outer_loop_range = softmaxed_qkdp.shape[0]
        inner_loop_range = values.shape[0]
        for i in range(outer_loop_range):
            for j in range(inner_loop_range):
                dim2_mat = torch.cat([dim2_mat, (softmaxed_qkdp[i][j]*values[j]).reshape(-1)])
            dim3_mat = torch.cat([dim3_mat, dim2_mat.reshape(1,values.shape[0],values.shape[1])])
            dim2_mat = torch.tensor([]) 
        return dim3_mat
    
    
    
    def getWeightedSum(self,softmax_weighted_values):
        next_layer_input = torch.tensor([])
        for i in softmax_weighted_values:
            transposed_i = i.t()
            new_word_representation = torch.tensor([])
            for j in transposed_i:
                rowsum = j.sum()
                new_word_representation = torch.cat([new_word_representation, rowsum.reshape(-1)])
            next_layer_input = \
            torch.cat([next_layer_input, new_word_representation.reshape(1,new_word_representation.shape[0])])    
        return next_layer_input
        
    
    
    def returnRepresentation(self):
        pos_encoded = self.PositionalEncoding(self.vectorRepresentations)
        new_dim = self.new_dim
        queries, keys, values = self.qkvs(pos_encoded, new_dim)
        qk_dotproductmatrix = self.qk_dotproducts(queries, keys)
        d_k = keys.shape[1] # to be changed later to square root of 'key' vector dimension
        qk_dotproductmatrix/=d_k
        softmaxed_qkdp = self.getSoftmaxed_qkdp(qk_dotproductmatrix)
        softmax_weighted_values = self.getSoftmaxWeightedValues(softmaxed_qkdp, values)
        weightedSum = self.getWeightedSum(softmax_weighted_values)
        return weightedSum                                           


In [4]:
def getWordVectors(sentence):
    sentence = sentence.split(' ')
    vecs = torch.rand((len(sentence),10))
    return vecs

In [5]:
wordVecs = getWordVectors('Hi there this is nuts')
wordVecs

tensor([[0.1249, 0.2305, 0.8137, 0.9940, 0.5596, 0.9809, 0.0052, 0.5765, 0.5092,
         0.3259],
        [0.5036, 0.3139, 0.7213, 0.0191, 0.4861, 0.8933, 0.5636, 0.2686, 0.0942,
         0.9750],
        [0.9305, 0.4522, 0.7052, 0.7546, 0.3413, 0.0745, 0.5006, 0.5105, 0.4106,
         0.4684],
        [0.4926, 0.1672, 0.9406, 0.7823, 0.2219, 0.3925, 0.1152, 0.1543, 0.9985,
         0.2329],
        [0.1314, 0.8438, 0.1214, 0.4164, 0.1291, 0.5644, 0.9300, 0.1889, 0.8463,
         0.5379]])

In [20]:
def multiHeadAttention(wordVecs, heads=2):
    sa = SelfAttention(wordVecs)
    listOfHeads = []
    op = torch.tensor([])
    for i in range(heads):
        temp = sa.returnRepresentation()
#         print(temp)
        listOfHeads.append(temp)
    
    outputRepresentation = torch.tensor([])
    for i in range(listOfHeads[0].shape[0]):
        outputRepresentation = torch.cat([listOfHeads[0][i],listOfHeads[1][i]])
        op = torch.cat([op, outputRepresentation.reshape(1,outputRepresentation.shape[0])])
    
    
    return op

In [21]:
a = multiHeadAttention(wordVecs)
a

tensor([[28.8738, 45.4530, 39.0485, 41.7196, 24.2624, 36.6069, 35.2012, 16.4551,
         34.2018, 28.1001],
        [28.8738, 45.4530, 39.0485, 41.7196, 24.2624, 36.6069, 35.2012, 16.4551,
         34.2018, 28.1001],
        [28.8738, 45.4530, 39.0485, 41.7196, 24.2624, 36.6069, 35.2012, 16.4551,
         34.2018, 28.1001],
        [28.8738, 45.4530, 39.0485, 41.7196, 24.2624, 36.6069, 35.2012, 16.4551,
         34.2018, 28.1001],
        [28.8738, 45.4530, 39.0485, 41.7196, 24.2624, 36.6069, 35.2012, 16.4551,
         34.2018, 28.1001]])