In [1]:
from utils import glove_embeddings
import scipy

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU, and make
# that choice the default device for every tensor created from here on.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
torch.set_default_device(device)

# Model dimensions.
# NOTE(review): glove_embeddings is assumed to be a sized container (e.g. a
# PyTorch tensor) with one entry per vocabulary item -- confirm against utils.
vocab_len = len(glove_embeddings)
num_hidden_layer_neurons = 128
max_input_length = 25
max_output_length = 100

# Initial values for the feed-forward layers: weights are drawn uniformly
# from [0, 1) (torch.rand), biases from a standard normal (torch.randn).
# The draw order (W1, W2, b1, b2) is kept so seeded runs stay reproducible.
W1 = torch.rand((max_input_length, num_hidden_layer_neurons))
W2 = torch.rand((num_hidden_layer_neurons, max_output_length))

b1 = torch.randn((num_hidden_layer_neurons,))
b2 = torch.randn((max_output_length,))

# Sanity-check the weight shapes before the model is built.
assert tuple(W1.shape) == (max_input_length, num_hidden_layer_neurons)
assert tuple(W2.shape) == (num_hidden_layer_neurons, max_output_length)

class FeedForwardNetwork(nn.Module):
    """Two-layer feed-forward network: ``relu(x @ W1 + b1) @ W2 + b2``.

    Each instance owns private copies of its initial weights, so training or
    mutating one instance never changes another instance or the tensors the
    initial values were taken from.

    Args:
        w1: Initial first-layer weight matrix. Defaults to the module-level
            ``W1`` tensor.
        w2: Initial second-layer weight matrix. Defaults to the module-level
            ``W2`` tensor.
        bias1: Initial first-layer bias. Defaults to the module-level ``b1``.
        bias2: Initial second-layer bias. Defaults to the module-level ``b2``.
    """

    def __init__(self, w1=None, w2=None, bias1=None, bias2=None):
        super().__init__()
        # Fall back to the module-level initial values when none are supplied,
        # which keeps the zero-argument constructor working as before.
        w1 = W1 if w1 is None else w1
        w2 = W2 if w2 is None else w2
        bias1 = b1 if bias1 is None else bias1
        bias2 = b2 if bias2 is None else bias2

        # nn.Parameter(tensor) shares storage with ``tensor``; clone so that
        # in-place optimizer updates do not leak into the source tensors (and
        # through them into every other instance).
        self.W1 = nn.Parameter(w1.detach().clone())
        self.W2 = nn.Parameter(w2.detach().clone())
        self.b1 = nn.Parameter(bias1.detach().clone())
        self.b2 = nn.Parameter(bias2.detach().clone())

    def forward(self, x):
        """Apply the network to ``x``.

        Args:
            x: The context vector (output of the attention heads). Its last
                dimension must match the number of rows of ``W1``.

        Returns:
            A tensor whose last dimension is the number of columns of ``W2``.
        """
        hidden = F.relu(torch.matmul(x, self.W1) + self.b1)  # ReLU activation
        return torch.matmul(hidden, self.W2) + self.b2

def scaled_dot_product_attention(query, key, value, mask=None):
    """Compute ``softmax(Q @ K^T / sqrt(d_k)) @ V``.

    Args:
        query: Tensor of shape ``(..., n_queries, d_k)``.
        key: Tensor of shape ``(..., n_keys, d_k)``.
        value: Tensor of shape ``(..., n_keys, d_v)``.
        mask: Optional tensor broadcastable to ``(..., n_queries, n_keys)``.
            Positions where the mask equals 0 (or ``False``) are excluded
            from attention.

    Returns:
        The context vectors, shape ``(..., n_queries, d_v)``: one weighted
        average of the ``value`` rows per query.
    """
    d_k = query.size(-1)
    # Rows index queries and columns index keys, so the softmax over the last
    # dimension normalises each query's weights across the keys. Transposing
    # the last two dims (rather than dims 0 and 1) keeps batched input working.
    scores = torch.matmul(query, key.transpose(-2, -1)) / (d_k ** 0.5)
    if mask is not None:
        # Use the most negative finite value instead of -inf so that a fully
        # masked row yields uniform weights rather than NaN.
        scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)
    attention_weights = F.softmax(scores, dim=-1)
    return torch.matmul(attention_weights, value)

# Usage example: build the network, push one random vector of length
# max_input_length through it, and print the shape of the result.
ffn = FeedForwardNetwork()
context_vector = torch.randn((max_input_length,))
output = ffn(context_vector)
print(output.shape)


torch.Size([100])
