<a href="https://colab.research.google.com/github/pmadhyastha/INM434/blob/main/attention_and_transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


In [None]:
# Toy POS-tagging corpus. Each entry is (sentence, tags) with exactly one tag
# per whitespace-separated token of the sentence.
training_data = [
    ("The cat is on the mat", ["DET", "NOUN", "VERB", "ADP", "DET", "NOUN"]),
    ("The dog ate my homework", ["DET", "NOUN", "VERB", "DET", "NOUN"]),
    ("I love to eat pizza", ["PRON", "VERB", "TO", "VERB", "NOUN"]),
    # "every" is a determiner modifying "morning", not an adverb.
    ("She runs every morning", ["PRON", "VERB", "DET", "NOUN"]),
    ("He drinks coffee and tea", ["PRON", "VERB", "NOUN", "CONJ", "NOUN"]),
]

# Held-out sentences in the same (sentence, tags) format. Most of their words
# ("bird", "flying", "they", ...) do not occur in training_data.
test_data = [
    ("The bird is flying", ["DET", "NOUN", "VERB", "VERB"]),
    ("They are playing soccer", ["PRON", "VERB", "VERB", "NOUN"]),
]


In [None]:
class POSDataset(Dataset):
    """Map-style dataset over ``(sentence, tags)`` pairs.

    Indexing yields the sentence lower-cased and split on whitespace,
    together with its tag list exactly as stored.
    """

    def __init__(self, data):
        # Sequence of (sentence string, list of tag strings) pairs.
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text, labels = self.data[idx]
        tokens = text.lower().split()
        return tokens, labels


In [None]:
def build_vocab(data):
    """Return the set of distinct lower-cased tokens found in ``data``.

    ``data`` is an iterable of ``(sentence, tags)`` pairs; only the sentence
    is used, split on whitespace after lower-casing.
    """
    return {
        token
        for text, _ in data
        for token in text.lower().split()
    }


In [None]:
class POSModel(nn.Module):
    """BiLSTM tagger with scaled dot-product self-attention over the LSTM states.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each word embedding.
        hidden_dim: LSTM hidden size per direction.
        output_dim: number of tag classes.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, sentence):
        """Return per-token tag scores.

        Args:
            sentence: LongTensor of word indices, shape (batch, seq_len).

        Returns:
            Tensor of unnormalised scores, shape (batch, seq_len, output_dim).
        """
        embedded = self.embedding(sentence)  # (batch, seq_len, embedding_dim)

        # nn.LSTM returns (outputs, (h_n, c_n)); only the per-token outputs
        # are needed here. outputs: (batch, seq_len, 2 * hidden_dim)
        outputs, _ = self.rnn(embedded)

        # Every token attends over all tokens of its own sentence, so the
        # result keeps one vector per token (a tagger needs one prediction
        # per word, not one per sentence).
        scale = outputs.size(-1) ** 0.5
        scores = torch.bmm(outputs, outputs.transpose(1, 2)) / scale  # (batch, seq_len, seq_len)
        attention_weights = torch.softmax(scores, dim=-1)
        attention = torch.bmm(attention_weights, outputs)  # (batch, seq_len, 2 * hidden_dim)

        return self.fc(attention)


In [None]:
# Sizes derived from the training corpus: one embedding row per distinct
# lower-cased word, one output class per distinct tag.
VOCAB_SIZE = len(build_vocab(training_data))
EMBEDDING_DIM = 100
HIDDEN_DIM = 100
OUTPUT_DIM = len({tag for _, tags in training_data for tag in tags})

model = POSModel(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
)


In [None]:
# Classification loss over tag classes, and an Adam optimiser (library-default
# hyperparameters) over every parameter of the model.
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters())


In [None]:
def train(model, train_data, loss_fn, optimizer, num_epochs):
    """Train ``model`` on ``train_data`` and print the mean batch loss per epoch.

    Args:
        model: module called with a (1, seq_len) LongTensor of word indices;
            its output is flattened to (seq_len, num_tags) before the loss.
        train_data: sequence of (sentence, tags) pairs, wrapped in POSDataset.
        loss_fn: callable taking (scores, target indices) and returning a scalar.
        optimizer: optimiser over the model's parameters.
        num_epochs: number of passes over ``train_data``.

    Reads the module-level ``vocab`` and ``tag2idx`` mappings, which must be
    defined before this function is called.
    """
    model.train()

    train_dataset = POSDataset(train_data)
    # The default collate function cannot batch variable-length lists of
    # strings (it transposes or rejects them), so keep every batch as a plain
    # list of (tokens, tags) pairs.
    train_dataloader = DataLoader(train_dataset, batch_size=2, collate_fn=list)

    for epoch in range(num_epochs):
        total_loss = 0.0
        for batch in train_dataloader:
            # Zero the gradients
            optimizer.zero_grad()

            # Sentences differ in length, so each one is run on its own
            # (batch dimension of 1) instead of being padded; the batch loss
            # is the mean of the per-sentence losses.
            batch_loss = 0.0
            for words, word_tags in batch:
                sentence = torch.LongTensor([[vocab[word] for word in words]])
                targets = torch.LongTensor([tag2idx[tag] for tag in word_tags])

                # Forward pass, flattened to (seq_len, num_tags)
                outputs = model(sentence)
                outputs = outputs.reshape(-1, outputs.shape[-1])

                batch_loss = batch_loss + loss_fn(outputs, targets)

            loss = batch_loss / len(batch)
            total_loss += loss.item()

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

        # Each accumulated value is already a per-batch mean, so average over
        # the number of batches (not the number of samples).
        print(f"Epoch {epoch + 1} loss: {total_loss / max(len(train_dataloader), 1)}")


In [None]:
def evaluate(model, test_data):
    """Print the token-level tagging accuracy of ``model`` on ``test_data``.

    Args:
        model: module called with a (1, seq_len) LongTensor of word indices;
            the argmax over its last dimension is taken as the predicted tag.
        test_data: sequence of (sentence, tags) pairs, wrapped in POSDataset.

    Reads the module-level ``vocab`` and ``tag2idx`` mappings. Words missing
    from ``vocab`` are mapped to ``vocab["<unk>"]`` when that entry exists and
    to index 0 otherwise; gold tags missing from ``tag2idx`` count as errors.
    """
    model.eval()

    test_dataset = POSDataset(test_data)
    # Keep batches as plain lists of (tokens, tags) pairs: the default collate
    # function cannot batch variable-length lists of strings.
    test_dataloader = DataLoader(test_dataset, batch_size=2, collate_fn=list)

    # Fallback index so that unseen test words do not raise KeyError.
    unk_index = vocab.get("<unk>", 0)

    total_correct = 0
    total_count = 0
    with torch.no_grad():
        for batch in test_dataloader:
            for words, word_tags in batch:
                # Convert the sentence to a (1, seq_len) tensor of indices
                sentence = torch.LongTensor([[vocab.get(word, unk_index) for word in words]])

                # Forward pass
                outputs = model(sentence)

                # Convert the outputs to one predicted tag index per token
                predicted_tags = outputs.argmax(dim=-1).reshape(-1).tolist()

                # Compute the number of correct predictions
                for predicted, gold in zip(predicted_tags, word_tags):
                    if predicted == tag2idx.get(gold):
                        total_correct += 1
                    total_count += 1

    # Print the accuracy (0.0 when there was nothing to evaluate)
    accuracy = total_correct / total_count if total_count else 0.0
    print(f"Accuracy: {accuracy:.2f}")


In [None]:
# Build the vocabulary and tag dictionaries. Sorting makes the index
# assignment reproducible; iterating a set of strings directly can yield a
# different order on every interpreter run.
vocab = {word: i for i, word in enumerate(sorted(build_vocab(training_data)))}
tag2idx = {tag: i for i, tag in enumerate(sorted({tag for _, tags in training_data for tag in tags}))}
idx2tag = {i: tag for tag, i in tag2idx.items()}

# Train the model. train() expects the raw (sentence, tags) pairs and builds
# its own DataLoader internally.
train(model, training_data, loss_fn, optimizer, num_epochs=10)

# Evaluate the model on the test data
evaluate(model, test_data)


TypeError: ignored

In [None]:
import torch
import torch.nn as nn

class RNN(nn.Module):
    """Per-step classifier: a vanilla RNN followed by self-attention over its states.

    Args:
        input_dim: number of features of each time step.
        hidden_dim: size of the RNN hidden state.
        output_dim: number of classes predicted at each time step.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.rnn = nn.RNN(input_dim, hidden_dim)
        # Projects each RNN state into the query used to score all states.
        self.attention = nn.Linear(hidden_dim, hidden_dim, bias=False)
        # Maps each attended state to class scores.
        self.out = nn.Linear(hidden_dim, output_dim)

    def forward(self, input_seq):
        """Return class scores of shape (seq_len, output_dim).

        Args:
            input_seq: float tensor of shape (seq_len, input_dim), or a
                sequence of 1-D float tensors of length input_dim.
        """
        if not torch.is_tensor(input_seq):
            input_seq = torch.stack(list(input_seq))

        # nn.RNN (batch_first=False) takes (seq_len, batch, input_dim) and
        # returns (outputs, h_n); use a batch of one and keep only the outputs.
        states, _ = self.rnn(input_seq.unsqueeze(1))
        states = states.squeeze(1)  # (seq_len, hidden_dim)

        # Scaled dot-product attention of every step over all steps.
        scores = (self.attention(states) @ states.t()) / (states.size(-1) ** 0.5)
        attention_weights = torch.softmax(scores, dim=-1)  # (seq_len, seq_len)
        context = attention_weights @ states  # (seq_len, hidden_dim)

        return self.out(context)


def main(epochs=100):
    """Smoke-test the model on random data and print its per-step accuracy.

    The inputs and labels are random, so the printed accuracy is only expected
    to be around chance level; this checks that training and inference run.

    Args:
        epochs: number of passes over the random training set.
    """
    seq_len, input_dim, hidden_dim, num_classes = 10, 10, 10, 10

    def make_dataset(num_sequences):
        # torch.randint requires an explicit size; labels lie in [0, num_classes).
        return [
            (torch.rand(seq_len, input_dim), torch.randint(0, num_classes, (seq_len,)))
            for _ in range(num_sequences)
        ]

    # Create the training data
    training_data = make_dataset(100)

    # Create the model
    model = RNN(input_dim, hidden_dim, num_classes)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

    # Train the model. nn.Module.train() only switches to training mode; the
    # optimisation loop has to be written out explicitly.
    model.train()
    for _ in range(epochs):
        for input_seq, output_seq in training_data:
            optimizer.zero_grad()
            loss = loss_fn(model(input_seq), output_seq)
            loss.backward()
            optimizer.step()

    # Test the model
    test_data = make_dataset(10)
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for input_seq, output_seq in test_data:
            # Get the predictions
            predictions = model(input_seq)
            correct += (torch.argmax(predictions, dim=1) == output_seq).sum().item()
            total += output_seq.numel()

    # Compute the accuracy over every step of every test sequence
    accuracy = correct / total

    # Print the accuracy
    print("Accuracy:", accuracy)


if __name__ == "__main__":
    main()

TypeError: ignored

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Training corpus: each sentence is a list of (word, tag) pairs
train_data = [
    [("The", "DT"), ("dog", "NN"), ("ran", "VBD"), ("across", "IN"), ("the", "DT"), ("field", "NN")],
    [("The", "DT"), ("cat", "NN"), ("jumped", "VBD"), ("over", "IN"), ("the", "DT"), ("moon", "NN")],
]

# Test corpus: a single sentence of (word, tag) pairs
test_data = [("The", "DT"), ("cow", "NN"), ("jumped", "VBD"), ("over", "IN"), ("the", "DT"), ("moon", "NN")]

# Map every distinct training word to an index, numbered by first occurrence
word2idx = {}
for tagged_sentence in train_data:
    for token, _tag in tagged_sentence:
        word2idx.setdefault(token, len(word2idx))

# Fixed mapping from tag to class index
tag2idx = dict(DT=0, NN=1, VBD=2, IN=3)

# Define RNN with attention-based modeling
class RNN(nn.Module):
    """Single-step LSTM tagger that is fed one word index at a time.

    Args:
        input_size: vocabulary size (rows of the embedding table).
        hidden_size: embedding size and LSTM hidden size.
        output_size: number of tag classes.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)
        self.attn = nn.Linear(hidden_size * 2, 1)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        """Process one word and return ``(scores, hidden, attn_weights)``.

        Args:
            input: tensor holding a single word index.
            hidden: ``(h, c)`` LSTM state, each of shape (1, 1, hidden_size).

        Returns:
            scores of shape (1, output_size), the updated ``(h, c)`` state and
            the attention weights of shape (1, 1).
        """
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.lstm(embedded, hidden)  # output: (1, 1, hidden_size)

        # Score the step from its LSTM output concatenated with the new hidden
        # state. Only one step is visible here, so the softmax runs over a
        # single element and the weight is always 1.
        attn_weights = torch.softmax(self.attn(torch.cat((output[0], hidden[0].squeeze(0)), 1)), dim=1)

        # (1, 1, 1) x (1, 1, hidden_size) -> (1, 1, hidden_size). torch.bmm
        # needs 3-D operands, so `output` is used as-is rather than transposed
        # and re-expanded into a 4-D tensor.
        attn_applied = torch.bmm(attn_weights.unsqueeze(0), output)
        output = self.out(attn_applied[0])
        return output, hidden, attn_weights

    def init_hidden(self):
        """Return a zero-initialised ``(h, c)`` state for a batch of one."""
        return (torch.zeros(1, 1, self.hidden_size),
                torch.zeros(1, 1, self.hidden_size))



# Initialize model and optimizer
model = RNN(len(word2idx), 128, len(tag2idx))
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Train the model: one update per sentence, feeding the words one at a time
# and carrying the LSTM state from each word to the next.
for epoch in range(100):
    for sentence in train_data:
        model.zero_grad()
        hidden = model.init_hidden()

        word_ids = [word2idx[word] for word, _ in sentence]
        tag_ids = [tag2idx[tag] for _, tag in sentence]
        sentence_tensor = torch.tensor(word_ids, dtype=torch.long)
        targets = torch.tensor(tag_ids, dtype=torch.long)

        # Sum the per-word losses, then back-propagate once per sentence.
        loss = 0
        for word_index, target in zip(sentence_tensor, targets):
            output, hidden, attn_weights = model(word_index, hidden)
            loss += nn.functional.cross_entropy(output, target.unsqueeze(0))
        loss.backward()
        optimizer.step()

RuntimeError: ignored

In [None]:
def randint(low, high, size=()):
    """Return random integers drawn uniformly from the inclusive range [low, high].

    Args:
        low: smallest value that can be returned.
        high: largest value that can be returned (inclusive).
        size: shape of the result; the default ``()`` gives a 0-dim tensor.

    Returns:
        An integer tensor of shape ``size``.
    """
    # torch.randint excludes its upper bound and requires an explicit size.
    return torch.randint(low, high + 1, size)


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Define LSTM model
class LSTMTagger(nn.Module):
    """Embedding -> LSTM -> linear tagger that emits log-probabilities per word."""

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence):
        """Return log-softmax tag scores with one row per element of ``sentence``."""
        seq_len = len(sentence)
        # Reshape to (seq_len, 1, features): a batch holding one sentence.
        lstm_input = self.word_embeddings(sentence).view(seq_len, 1, -1)
        lstm_out, _ = self.lstm(lstm_input)
        logits = self.hidden2tag(lstm_out.view(seq_len, -1))
        return F.log_softmax(logits, dim=1)

# Training sentences (already tokenised) paired with one tag per token
training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"]),
]

# Index maps built from the distinct words and tags of the training data
distinct_words = {word for sent, _ in training_data for word in sent}
distinct_tags = {tag for _, tags in training_data for tag in tags}
word_to_ix = {word: i for i, word in enumerate(distinct_words)}
tag_to_ix = {tag: i for i, tag in enumerate(distinct_tags)}
ix_to_tag = {index: tag for tag, index in tag_to_ix.items()}

print(tag_to_ix)
print(ix_to_tag)
# Define hyperparameters
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

# Initialize model, loss function, and optimizer
model = LSTMTagger(
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    vocab_size=len(word_to_ix),
    tagset_size=len(tag_to_ix),
)
loss_function = nn.NLLLoss()  # the model already returns log-probabilities
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Train the model: one parameter update per training sentence
for epoch in range(300):
    for sentence, tags in training_data:
        # Clear out gradients left over from the previous step
        model.zero_grad()

        # Encode the words and tags as index tensors
        word_indices = [word_to_ix[word] for word in sentence]
        tag_indices = [tag_to_ix[tag] for tag in tags]
        sentence_in = torch.tensor(word_indices, dtype=torch.long)
        targets = torch.tensor(tag_indices, dtype=torch.long)

        # Forward pass
        tag_scores = model(sentence_in)

        # Compute loss, gradients, and update parameters
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

# Test the model on a sample sentence: print its log-probabilities per tag
with torch.no_grad():
    inputs = torch.tensor([word_to_ix[word] for word in training_data[0][0]], dtype=torch.long)
    tag_scores = model(inputs)
    print(tag_scores)

# Define some test sentences and their corresponding tags
test_data = [
    ("The dog read the book".split(), ["DET", "NN", "V", "DET", "NN"])
]

# Evaluate the model on the test set
with torch.no_grad():
    for sentence, tags in test_data:
        # Prepare the input sequence from the sentence of THIS iteration
        # (not always the first test sentence).
        inputs = torch.tensor([word_to_ix[word] for word in sentence], dtype=torch.long)

        # Pass the input sequence through the model
        tag_scores = model(inputs)

        # Best tag index per word: the argmax must run over the tag axis
        # (dim=1); dim=0 would return word positions instead.
        predicted_indices = tag_scores.argmax(dim=1).numpy()
        print(predicted_indices)

        # Get the predicted tags from the predicted indices
        predicted_tags = [ix_to_tag[tag] for tag in predicted_indices]

        # Print the sentence, true tags, and predicted tags
        print("Sentence: ", sentence)
        print("True tags: ", tags)
        print("Predicted tags: ", predicted_tags)
        print()



{'V': 0, 'DET': 1, 'NN': 2}
{0: 'V', 1: 'DET', 2: 'NN'}
tensor([[-2.9095, -0.1710, -2.2765],
        [-4.1558, -4.0201, -0.0342],
        [-0.0291, -4.4511, -4.0730],
        [-4.1403, -0.0289, -4.3800],
        [-4.6887, -3.8625, -0.0307]])
[2 3 4]
Sentence:  ['The', 'dog', 'read', 'the', 'book']
True tags:  ['DET', 'NN', 'V', 'DET', 'NN']
Predicted tags:  ['DET', 'NN', 'V', 'DET', 'NN']



In [None]:
# Display the token list of the first training sentence
training_data[0][0]

['The', 'dog', 'ate', 'the', 'apple']

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class LSTMTaggerWithAttention(nn.Module):
    """BiLSTM tagger whose token states are enriched with an attention-pooled context.

    Args:
        embedding_dim: size of each word embedding.
        hidden_dim: LSTM hidden size per direction.
        vocab_size: number of rows in the embedding table.
        tagset_size: number of tag classes.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)
        self.hidden2tag = nn.Linear(hidden_dim*2, tagset_size)
        self.attention = nn.Linear(hidden_dim*2, 1)

    def forward(self, sentence):
        """Return log-softmax tag scores of shape (seq_len, tagset_size).

        Args:
            sentence: LongTensor of word indices for ONE sentence, either of
                shape (seq_len,) or with a leading batch axis of size one.
        """
        # Flatten so that a (1, seq_len) input is handled like (seq_len,).
        sentence = sentence.reshape(-1)
        seq_len = sentence.size(0)

        embeds = self.word_embeddings(sentence)
        lstm_out, _ = self.lstm(embeds.view(seq_len, 1, -1))  # (seq_len, 1, 2 * hidden_dim)

        # Compute attention weights over the time axis and pool the states
        # into a single sentence-level context vector.
        attn_weights = F.softmax(self.attention(lstm_out), dim=0)  # (seq_len, 1, 1)
        context = (attn_weights * lstm_out).sum(dim=0, keepdim=True)  # (1, 1, 2 * hidden_dim)

        # Add the context to every token state so the output keeps one row
        # per token, as a tagger requires.
        attn_applied = lstm_out + context

        tag_space = self.hidden2tag(attn_applied.view(seq_len, -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

# Training sentences (already tokenised) paired with one tag per token
training_data = [
    ("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
    ("Everybody read that book".split(), ["NN", "V", "DET", "NN"]),
]

# Index maps built from the distinct words and tags of the training data
distinct_words = {word for sent, _ in training_data for word in sent}
distinct_tags = {tag for _, tags in training_data for tag in tags}
word_to_ix = {word: i for i, word in enumerate(distinct_words)}
tag_to_ix = {tag: i for i, tag in enumerate(distinct_tags)}
ix_to_tag = {index: tag for tag, index in tag_to_ix.items()}

print(tag_to_ix)
print(ix_to_tag)

# Define hyperparameters
EMBEDDING_DIM = 6
HIDDEN_DIM = 6

# Initialize model, loss function, and optimizer
model = LSTMTaggerWithAttention(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()  # the model returns log-probabilities
optimizer = optim.SGD(model.parameters(), lr=0.1)

# Train the model: one parameter update per training sentence
for epoch in range(300):
    for sentence, tags in training_data:
        # Clear out gradients
        model.zero_grad()

        # Prepare inputs for the network
        sentence_in = torch.tensor([word_to_ix[word] for word in sentence], dtype=torch.long)
        targets = torch.tensor([tag_to_ix[tag] for tag in tags], dtype=torch.long)

        # Forward pass. The model takes the 1-D index tensor directly and adds
        # its own batch axis; a leading unsqueeze(0) would make len(sentence)
        # equal to 1 inside forward().
        tag_scores = model(sentence_in)

        # Compute loss, gradients, and update parameters
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

# Test the model on a sample sentence: print its log-probabilities per tag
with torch.no_grad():
    inputs = torch.tensor([word_to_ix[word] for word in training_data[0][0]], dtype=torch.long)
    # Pass the 1-D index tensor, exactly as in the evaluation loop below.
    tag_scores = model(inputs)
    print(tag_scores)

# Define some test sentences and their corresponding tags
test_data = [
    ("The dog read the book".split(), ["DET", "NN", "V", "DET", "NN"])
]

# Evaluate the model on the test set
with torch.no_grad():
    for sentence, tags in test_data:
        # Prepare the input sequence from the sentence of THIS iteration
        # (not always the first test sentence).
        inputs = torch.tensor([word_to_ix[word] for word in sentence], dtype=torch.long)

        # Pass the input sequence through the model
        tag_scores = model(inputs)

        # Best tag index per word: the argmax must run over the tag axis
        # (dim=1); dim=0 would return word positions instead.
        predicted_indices = tag_scores.argmax(dim=1).numpy()
        print(predicted_indices)

        # Get the predicted tags from the predicted indices
        predicted_tags = [ix_to_tag[tag] for tag in predicted_indices]

        # Print the sentence, true tags, and predicted tags
        print("Sentence: ", sentence)
        print("True tags: ", tags)
        print("Predicted tags: ", predicted_tags)
        print()


{'V': 0, 'DET': 1, 'NN': 2}
{0: 'V', 1: 'DET', 2: 'NN'}


RuntimeError: ignored

In [None]:
import torch
import torch.nn.functional as F

# --- Tokenise and embed the example sentence --------------------------------
sentence = 'Life is short, eat dessert first'

# Map every word (comma removed) to its rank in alphabetical order.
dc = {s:i for i,s in enumerate(sorted(sentence.replace(',', '').split()))}
print(dc)

sentence_int = torch.tensor([dc[s] for s in sentence.replace(',', '').split()])
print(sentence_int)

torch.manual_seed(123)
# One embedding row per vocabulary entry (derived from dc rather than
# hard-coded), 16 features each.
embed = torch.nn.Embedding(len(dc), 16)
embedded_sentence = embed(sentence_int).detach()

print(embedded_sentence)
print(embedded_sentence.shape)

# --- Single-head self-attention for the second word --------------------------
torch.manual_seed(123)

d = embedded_sentence.shape[1]

# Queries and keys must share a size (they are dot-multiplied); values may differ.
d_q, d_k, d_v = 24, 24, 28

W_query = torch.rand(d_q, d)
W_key = torch.rand(d_k, d)
W_value = torch.rand(d_v, d)

# Project the second word (index 1) into query, key and value space.
x_2 = embedded_sentence[1]
query_2 = W_query.matmul(x_2)
key_2 = W_key.matmul(x_2)
value_2 = W_value.matmul(x_2)

print(query_2.shape)
print(key_2.shape)
print(value_2.shape)

# Keys and values of every word: one row per word.
keys = W_key.matmul(embedded_sentence.T).T
values = W_value.matmul(embedded_sentence.T).T

print("keys.shape:", keys.shape)
print("values.shape:", values.shape)


# Unnormalised attention score between the second word's query and the
# fifth word's key (index 4).
omega_24 = query_2.dot(keys[4])
print(omega_24)

# Scores of the second word's query against every key.
omega_2 = query_2.matmul(keys.T)
print(omega_2)

# Scale by sqrt(d_k) before normalising the scores with softmax.
attention_weights_2 = F.softmax(omega_2 / d_k**0.5, dim=0)
print(attention_weights_2)

# Context vector: attention-weighted sum of the value rows.
context_vector_2 = attention_weights_2.matmul(values)

print(context_vector_2.shape)
print(context_vector_2)


# --- Multi-head attention: h independent projection sets ---------------------
h = 3
multihead_W_query = torch.rand(h, d_q, d)
multihead_W_key = torch.rand(h, d_k, d)
multihead_W_value = torch.rand(h, d_v, d)


multihead_query_2 = multihead_W_query.matmul(x_2)
print(multihead_query_2.shape)

# One copy of the transposed embeddings per head (tied to h, not a literal 3).
stacked_inputs = embedded_sentence.T.repeat(h, 1, 1)
print(stacked_inputs.shape)


multihead_keys = torch.bmm(multihead_W_key, stacked_inputs)
multihead_values = torch.bmm(multihead_W_value, stacked_inputs)
print("multihead_keys.shape:", multihead_keys.shape)
print("multihead_values.shape:", multihead_values.shape)


# Swap the last two axes so that the word axis comes before the feature axis.
multihead_keys = multihead_keys.permute(0, 2, 1)
multihead_values = multihead_values.permute(0, 2, 1)
print("multihead_keys.shape:", multihead_keys.shape)
print("multihead_values.shape:", multihead_values.shape)


# --- Same projections applied to a second input sequence ---------------------
torch.manual_seed(123)

d = embedded_sentence.shape[1]
print("embedded_sentence.shape:", embedded_sentence.shape)

d_q, d_k, d_v = 24, 24, 28

W_query = torch.rand(d_q, d)
W_key = torch.rand(d_k, d)
W_value = torch.rand(d_v, d)

x_2 = embedded_sentence[1]
query_2 = W_query.matmul(x_2)
print("query.shape", query_2.shape)

keys = W_key.matmul(embedded_sentence.T).T
values = W_value.matmul(embedded_sentence.T).T

print("keys.shape:", keys.shape)
print("values.shape:", values.shape)



embedded_sentence_2 = torch.rand(8, 16) # 2nd input sequence

# The number of key/value rows follows the length of the sequence being
# projected (8 here).
keys = W_key.matmul(embedded_sentence_2.T).T
values = W_value.matmul(embedded_sentence_2.T).T

print("keys.shape:", keys.shape)
print("values.shape:", values.shape)

{'Life': 0, 'dessert': 1, 'eat': 2, 'first': 3, 'is': 4, 'short': 5}
tensor([0, 4, 5, 2, 1, 3])
tensor([[ 0.3374, -0.1778, -0.3035, -0.5880,  0.3486,  0.6603, -0.2196, -0.3792,
          0.7671, -1.1925,  0.6984, -1.4097,  0.1794,  1.8951,  0.4954,  0.2692],
        [ 0.5146,  0.9938, -0.2587, -1.0826, -0.0444,  1.6236, -2.3229,  1.0878,
          0.6716,  0.6933, -0.9487, -0.0765, -0.1526,  0.1167,  0.4403, -1.4465],
        [ 0.2553, -0.5496,  1.0042,  0.8272, -0.3948,  0.4892, -0.2168, -1.7472,
         -1.6025, -1.0764,  0.9031, -0.7218, -0.5951, -0.7112,  0.6230, -1.3729],
        [-1.3250,  0.1784, -2.1338,  1.0524, -0.3885, -0.9343, -0.4991, -1.0867,
          0.8805,  1.5542,  0.6266, -0.1755,  0.0983, -0.0935,  0.2662, -0.5850],
        [-0.0770, -1.0205, -0.1690,  0.9178,  1.5810,  1.3010,  1.2753, -0.2010,
          0.4965, -1.5723,  0.9666, -1.1481, -1.1589,  0.3255, -0.6315, -2.8400],
        [ 0.8768,  1.6221, -1.4779,  1.1331, -1.2203,  1.3139,  1.0533,  0.1388,
        

In [None]:
import torch
import torch.nn.functional as F

# Define the input sentence
input_sentence = 'Life is short, eat dessert first'

# Create a dictionary that maps each word to a unique index
word_to_index = {word: i for i, word in enumerate(sorted(input_sentence.replace(',', '').split()))}

# Convert the input sentence to a tensor of indices using the word_to_index dictionary
input_tensor = torch.tensor([word_to_index[word] for word in input_sentence.replace(',', '').split()])

# Define the embedding layer
embedding_layer = torch.nn.Embedding(len(word_to_index), 16)

# Embed the input tensor to get the embedded sentence
embedded_sentence = embedding_layer(input_tensor).detach()

# Define the size of the embedding
embedding_size = embedded_sentence.shape[1]

# Define the sizes of the query, key, and value vectors
query_size, key_size, value_size = 24, 24, 28

# Define the query, key, and value weight matrices
query_weights = torch.rand(query_size, embedding_size)
key_weights = torch.rand(key_size, embedding_size)
value_weights = torch.rand(value_size, embedding_size)

# Get the query, key, and value vectors for the second word in the input sentence
x_2 = embedded_sentence[1]
query_2 = query_weights.matmul(x_2)
key_2 = key_weights.matmul(x_2)
value_2 = value_weights.matmul(x_2)

# Compute the keys and values for all words in the input sentence
keys = key_weights.matmul(embedded_sentence.transpose(0, 1)).transpose(0, 1)
values = value_weights.matmul(embedded_sentence.transpose(0, 1)).transpose(0, 1)

# Compute the attention weights for the second word in the input sentence.
# The scale uses this cell's own key_size, not a name from another cell.
omega_2 = query_2.matmul(keys.T)
attention_weights_2 = F.softmax(omega_2 / key_size**0.5, dim=0)


# Compute the context vector for the second word in the input sentence
context_vector_2 = attention_weights_2.matmul(values)

# Define the number of attention heads
num_heads = 3

# Define the query, key, and value weight matrices for the multi-head attention
multihead_query_weights = torch.rand(num_heads, query_size, embedding_size)
multihead_key_weights = torch.rand(num_heads, key_size, embedding_size)
multihead_value_weights = torch.rand(num_heads, value_size, embedding_size)

# Get the query vector for the second word in the input sentence for each attention head
multihead_query_2 = multihead_query_weights.matmul(x_2)  # (num_heads, query_size)

# Repeat the embedded sentence for each attention head
repeated_embedded_sentence = embedded_sentence.unsqueeze(0).repeat(num_heads, 1, 1)

# Compute the keys and values for all words in the input sentence for each attention head
multihead_keys = multihead_key_weights.matmul(repeated_embedded_sentence.transpose(1, 2)).transpose(1, 2)
multihead_values = multihead_value_weights.matmul(repeated_embedded_sentence.transpose(1, 2)).transpose(1, 2)

# Compute the attention weights and context vectors for each attention head.
# Each head's query must only meet that head's keys, so pair them with a
# batched product; a plain 2-D x 3-D matmul would broadcast every query
# against every head's keys.
multihead_scores = torch.bmm(multihead_keys, multihead_query_2.unsqueeze(2)).squeeze(2)  # (num_heads, num_words)
attention_weights = F.softmax(multihead_scores / key_size**0.5, dim=1)
context_vectors = torch.bmm(attention_weights.unsqueeze(1), multihead_values).squeeze(1)  # (num_heads, value_size)

# Create a random tensor to represent a second input sequence
embedded_sentence_2 = torch.rand(8, 16)

# Compute the keys and values for all words in the second input sequence,
# using the weight matrices defined in this cell
keys = key_weights.matmul(embedded_sentence_2.T).T
values = value_weights.matmul(embedded_sentence_2.T).T

