In [213]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn

In [214]:
input_text = "Artificial Intelligence is good"
input_sequences = input_text.split()
input_sequences

['Artificial', 'Intelligence', 'is', 'good']

In [None]:
# Assign token number for each word and add Standard Tokens like <SOS>, <EOS> etc.,.

tokens = []
vocab_dictionary = dict()
vocab_dictionary["<SOS>"] = 0
for i,x in enumerate(input_sequences):
    vocab_dictionary[x] = i + 1
    tokens.append(i)
for k,v in vocab_dictionary.items():
    print(f"Key: {k}, Value: {v}")

Key: <SOS>, Value: 0
Key: Artificial, Value: 1
Key: Intelligence, Value: 2
Key: is, Value: 3
Key: good, Value: 4


In [None]:
# Send the computed tokens to an Embedding layer to compute embeddings for each token

V = len(vocab_dictionary)
embedding_dim = 8
embedding_layer = nn.Embedding(V, embedding_dim)
embeddings = embedding_layer(torch.tensor(tokens, dtype = torch.int))
print(embeddings)
print(embeddings.shape)

tensor([[ 1.0893,  0.2141,  1.7323, -0.2315,  0.4474,  0.0501,  0.5508, -0.5660],
        [ 0.1910,  0.2929,  0.1459,  0.2864, -1.1144,  0.7417,  0.1784, -0.2458],
        [-0.0596, -1.1607,  0.1165,  0.5596, -0.2945, -0.2762, -1.1215,  0.9584],
        [-0.9528,  0.9762,  1.1463, -1.1762,  0.3640,  1.4789, -0.0288,  1.0853]],
       grad_fn=<EmbeddingBackward0>)
torch.Size([4, 8])


In [None]:
def pe_sin(pos, i, dmodel):                          # Positional Encoding (sin)
    val = pos / (10000 ** ((2 * i) / dmodel))
    return torch.sin(torch.tensor(val))

def pe_cos(pos, i, dmodel):                          # Positional Encoding (cos)
    val = pos / (10000 ** ((2 * i) / dmodel))
    return torch.cos(torch.tensor(val))

positional_encodings = []                            # Calculate the positional information for each token and add them to the embeddings resp.
for j in range(len(tokens)):
    temp = []
    for i in range(embedding_dim // 2):
        sin_val = pe_sin(j, i, embedding_dim)
        cos_val = pe_cos(j, i, embedding_dim)
        temp.append(sin_val.item())
        temp.append(cos_val.item())
    positional_encodings.append(temp)

for x in positional_encodings:
    print(x)


[0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0]
[0.8414709568023682, 0.5403023362159729, 0.0998334214091301, 0.9950041770935059, 0.009999833069741726, 0.9999499917030334, 0.0009999999310821295, 0.9999995231628418]
[0.9092974066734314, -0.416146844625473, 0.19866932928562164, 0.9800665974617004, 0.019998665899038315, 0.9998000264167786, 0.0019999986980110407, 0.9999980330467224]
[0.14112000167369843, -0.9899924993515015, 0.29552021622657776, 0.9553365111351013, 0.029995499178767204, 0.9995500445365906, 0.0029999956022948027, 0.9999955296516418]


In [218]:
V

5

In [219]:
final_embeddings = []

for i in range(V-1):
    temp = []
    for j in range(embedding_dim):
        val = positional_encodings[i][j] + embeddings[i][j]
        temp.append(val)
    final_embeddings.append(temp)

for x in final_embeddings:
    print(x)

[tensor(1.0893, grad_fn=<AddBackward0>), tensor(1.2141, grad_fn=<AddBackward0>), tensor(1.7323, grad_fn=<AddBackward0>), tensor(0.7685, grad_fn=<AddBackward0>), tensor(0.4474, grad_fn=<AddBackward0>), tensor(1.0501, grad_fn=<AddBackward0>), tensor(0.5508, grad_fn=<AddBackward0>), tensor(0.4340, grad_fn=<AddBackward0>)]
[tensor(1.0325, grad_fn=<AddBackward0>), tensor(0.8332, grad_fn=<AddBackward0>), tensor(0.2457, grad_fn=<AddBackward0>), tensor(1.2814, grad_fn=<AddBackward0>), tensor(-1.1044, grad_fn=<AddBackward0>), tensor(1.7416, grad_fn=<AddBackward0>), tensor(0.1794, grad_fn=<AddBackward0>), tensor(0.7542, grad_fn=<AddBackward0>)]
[tensor(0.8497, grad_fn=<AddBackward0>), tensor(-1.5768, grad_fn=<AddBackward0>), tensor(0.3151, grad_fn=<AddBackward0>), tensor(1.5396, grad_fn=<AddBackward0>), tensor(-0.2745, grad_fn=<AddBackward0>), tensor(0.7236, grad_fn=<AddBackward0>), tensor(-1.1195, grad_fn=<AddBackward0>), tensor(1.9584, grad_fn=<AddBackward0>)]
[tensor(-0.8117, grad_fn=<AddBack

In [None]:
# Let number of heads be 2 (called as MHA - Multi Head Attention). 
# So we define 6 matrices with random weights

heads = 2
q1 = torch.rand(size = (8,8))
q2 = torch.rand(size = (8,8))
k1 = torch.rand(size = (4,8))
k2 = torch.rand(size = (8,8))
v1 = torch.rand(size = (8,8))
v2 = torch.rand(size = (4,8))

In [None]:
# Head 1 (Head Dimension = Embed_dim / number of heads)

q1 = torch.rand(embedding_dim, embedding_dim)       # Initialise with random weights
k1 = torch.rand(embedding_dim, embedding_dim)
v1 = torch.rand(embedding_dim, embedding_dim)

Q1 = torch.tensor(final_embeddings) @ q1            # Compute the Linear Projections
K1 = torch.tensor(final_embeddings) @ k1 
V1 = torch.tensor(final_embeddings) @ v1 

scores = Q1 @ K1.T                                  # Compute the Self Attention and we divide with head dimension to reduce the variance
scores_scaled = scores / torch.sqrt(torch.tensor(embedding_dim, dtype=torch.float32))
attention_weights = torch.softmax(scores_scaled, dim = -1)
output1 = attention_weights @ V1
output1


tensor([[3.7551, 4.7911, 4.8471, 1.5453, 3.7349, 3.0951, 3.6462, 4.3870],
        [3.7452, 4.7840, 4.8100, 1.5453, 3.7162, 3.0939, 3.6380, 4.3667],
        [3.3487, 4.4498, 3.4918, 1.5447, 3.0703, 2.9828, 3.3307, 3.6021],
        [3.7478, 4.7869, 4.8239, 1.5452, 3.7237, 3.0955, 3.6428, 4.3757]])

In [None]:
# Head 2 (we do the same)

q2 = torch.rand(embedding_dim, embedding_dim)
k2 = torch.rand(embedding_dim, embedding_dim)
v2 = torch.rand(embedding_dim, embedding_dim)

Q2 = torch.tensor(final_embeddings) @ q2  
K2 = torch.tensor(final_embeddings) @ k2
V2 = torch.tensor(final_embeddings) @ v2 

scores = Q2 @ K2.T 
scores_scaled = scores / torch.sqrt(torch.tensor(embedding_dim, dtype=torch.float32))
attention_weights = torch.softmax(scores_scaled, dim = -1)
output2 = attention_weights @ V2
output2

tensor([[2.0149, 4.4096, 3.1001, 2.5337, 3.5869, 4.0890, 2.7248, 2.3034],
        [2.0147, 4.4093, 3.1001, 2.5335, 3.5867, 4.0887, 2.7247, 2.3034],
        [1.9345, 4.2268, 3.1134, 2.4407, 3.4816, 3.9703, 2.6597, 2.3200],
        [2.0136, 4.4068, 3.1003, 2.5324, 3.5854, 4.0870, 2.7238, 2.3037]])

In [None]:
# Concatenate the attention results

head_results = [output1, output2]
new = torch.cat(head_results, dim = 1)
linear_transformer = torch.rand(size = (16,8))
mha_result = new @ linear_transformer
mha_result

tensor([[27.4424, 32.7114, 29.9706, 31.2633, 30.0390, 21.1723, 22.2792, 24.6661],
        [27.3981, 32.6641, 29.9042, 31.1961, 29.9862, 21.1387, 22.2426, 24.6168],
        [25.4571, 30.4337, 27.1171, 28.4429, 27.6653, 19.6086, 20.6311, 22.5667],
        [27.4122, 32.6777, 29.9266, 31.2203, 30.0026, 21.1470, 22.2534, 24.6347]])

In [None]:
# Apply LayerNorm functionality i.e. Layer Normalisation

for x in mha_result:
    mean1 = x.mean()
    var = x.std()
    for i in range(len(x)):
        x[i] = ( x[i] - mean1)/var
mha_result

tensor([[-1.5585e-04,  1.2264e+00,  5.8837e-01,  8.8930e-01,  6.0431e-01,
         -1.4597e+00, -1.2021e+00, -6.4643e-01],
        [ 1.1101e-03,  1.2296e+00,  5.8573e-01,  8.8710e-01,  6.0486e-01,
         -1.4591e+00, -1.2016e+00, -6.4772e-01],
        [ 5.5506e-02,  1.3297e+00,  4.8051e-01,  8.1997e-01,  6.2086e-01,
         -1.4419e+00, -1.1801e+00, -6.8453e-01],
        [ 6.6287e-04,  1.2280e+00,  5.8674e-01,  8.8830e-01,  6.0447e-01,
         -1.4597e+00, -1.2018e+00, -6.4674e-01]])

In [None]:
# Add the outputs of MHA and Layer Normalisation

for i in range(4):
    for j in range(embedding_dim):
        mha_result[i][j] = mha_result[i][j] + final_embeddings[i][j]
mha_result

tensor([[ 1.0891,  2.4405,  2.3207,  1.6578,  1.0517, -0.4096, -0.6513, -0.2125],
        [ 1.0336,  2.0627,  0.8315,  2.1685, -0.4996,  0.2826, -1.0222,  0.1065],
        [ 0.9052, -0.2471,  0.7957,  2.3596,  0.3463, -0.7182, -2.2996,  1.2739],
        [-0.8111,  1.2143,  2.0286,  0.6675,  0.9984,  1.0188, -1.2276,  1.4386]],
       grad_fn=<CopySlices>)

In [None]:
class FFNEncoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.seq = nn.Sequential(
            nn.Linear(8,16),
            nn.ReLU(),
            nn.Linear(16,8),
            nn.ReLU()
        )

    def forward(self,x):
        return self.seq(x)
    

In [None]:
# Define a Feed Forward Neural Network - must at end of every decoder

ffn = FFNEncoder()
ffn_output = ffn(mha_result)

In [None]:
# This is the output at the end of encoder

encoder_output = ffn_output + mha_result
encoder_output

tensor([[ 1.0891,  2.9015,  2.3574,  1.7746,  1.0517, -0.4096, -0.6513,  0.1449],
        [ 1.0585,  2.4158,  1.0580,  2.3792, -0.4877,  0.2826, -1.0222,  0.3533],
        [ 1.0857,  0.2168,  1.0208,  2.4628,  0.3463, -0.7182, -2.2996,  1.7305],
        [-0.7969,  1.6482,  2.1086,  1.0375,  0.9984,  1.0188, -1.2276,  1.8351]],
       grad_fn=<AddBackward0>)

In [None]:
# For decoder we start with <SOS> and perform teacher forcing in training pahse and Autoregressive mode in Prediction phase

vocab_dict = {"<EOS>":0}
embedding_dim = 8
embedding_layer = nn.Embedding(len(vocab_dict), embedding_dim)

decoder_token = ["<EOS>"]
decoder_token_ids = [vocab_dict[token] for token in decoder_token]

embeddings = embedding_layer(torch.tensor(decoder_token_ids, dtype=torch.long))
embeddings = embeddings[0]
embeddings


tensor([ 0.1836,  0.2251,  0.9242,  0.5236, -0.0894, -1.3240, -2.0308, -0.1971],
       grad_fn=<SelectBackward0>)

In [230]:
def pe_sin(pos, i, dmodel):
    val = pos / (10000 ** ((2 * i) / dmodel))
    return torch.sin(torch.tensor(val))

def pe_cos(pos, i, dmodel):
    val = pos / (10000 ** ((2 * i) / dmodel))
    return torch.cos(torch.tensor(val))

positional_encodings = []
for i in range(4):
    sin_val = pe_sin(0, i, embedding_dim)
    cos_val = pe_cos(0, i, embedding_dim)
    positional_encodings.append(sin_val.item())
    positional_encodings.append(cos_val.item())

positional_encodings


[0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0]

In [241]:
final_embeddings = (torch.tensor(embeddings) + torch.tensor(positional_encodings)).unsqueeze(0)
final_embeddings

  final_embeddings = (torch.tensor(embeddings) + torch.tensor(positional_encodings)).unsqueeze(0)


tensor([[ 0.1836,  1.2251,  0.9242,  1.5236, -0.0894, -0.3240, -2.0308,  0.8029]])

In [None]:
heads = 2
q1 = torch.rand(size = (8,8))
q2 = torch.rand(size = (8,8))
k1 = torch.rand(size = (1,8))
k2 = torch.rand(size = (8,8))
v1 = torch.rand(size = (8,8))
v2 = torch.rand(size = (1,8))


tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

In [243]:
final_embeddings

tensor([[ 0.1836,  1.2251,  0.9242,  1.5236, -0.0894, -0.3240, -2.0308,  0.8029]])

In [None]:
# Perform Masked Multi Head Attention to mask the future outputs before itself

embedding_dim = 8
final_embeddings = torch.rand(1, embedding_dim)

q1 = torch.rand(8, 8)
k1 = torch.rand(8, 8)
v1 = torch.rand(8, 8)

Q1 = final_embeddings @ q1
K1 = final_embeddings @ k1
V1 = final_embeddings @ v1

scores = Q1 @ K1.T
scores_scaled = scores / torch.sqrt(torch.tensor(embedding_dim, dtype=torch.float32))

mask = torch.tril(torch.ones_like(scores_scaled))
scores_masked = scores_scaled.masked_fill(mask == 0, float('-inf'))

attention_weights = torch.softmax(scores_masked, dim=-1)
output1 = attention_weights @ V1
print(output1)


tensor([[1.5074, 1.4913, 1.8286, 1.9777, 0.9197, 1.9518, 1.5603, 2.1625]])


In [None]:
# Apply Layer Normalisation

for x in output1:
    mean1 = x.mean()
    var = x.std()
    for i in range(len(x)):
        x[i] = ( x[i] - mean1)/var
output1

tensor([[-0.4274, -0.4686,  0.3921,  0.7726, -1.9272,  0.7065, -0.2924,  1.2443]])

In [None]:
# Compute the addition of outputs form both LayerNorm and Masked MHA

print(final_embeddings)
for i in range(1):
    for j in range(embedding_dim):
        output1[i][j] = output1[i][j] + final_embeddings[i][j]
output1

tensor([[0.5763, 0.2584, 0.2162, 0.1674, 0.2618, 0.7973, 0.8694, 0.4789]])


tensor([[ 3.0302,  1.0818,  1.6893,  1.7771, -0.3564,  5.4904,  4.9240,  4.1179]])

In [None]:
# Now the output is passed to Cross Attention (Q - from Decoder and K,V - form Encoder and remaining is the same)

q1 = torch.rand(embedding_dim, embedding_dim)
k1 = torch.rand(embedding_dim, embedding_dim)
v1 = torch.rand(embedding_dim, embedding_dim)

Q1 = encoder_output @ q1 
K1 = final_embeddings @ k1
V1 = final_embeddings @ v1

scores = Q1 @ K1.T
scores_scaled = scores / torch.sqrt(torch.tensor(embedding_dim, dtype=torch.float32))
attention_weights = torch.softmax(scores_scaled, dim=-1)
output2 = attention_weights @ V1
output2


tensor([[1.3961, 0.8158, 2.0619, 1.5589, 1.7509, 1.2019, 2.1174, 1.1989],
        [1.3961, 0.8158, 2.0619, 1.5589, 1.7509, 1.2019, 2.1174, 1.1989],
        [1.3961, 0.8158, 2.0619, 1.5589, 1.7509, 1.2019, 2.1174, 1.1989],
        [1.3961, 0.8158, 2.0619, 1.5589, 1.7509, 1.2019, 2.1174, 1.1989]],
       grad_fn=<MmBackward0>)

In [269]:
class FFNEncoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.seq = nn.Sequential(
            nn.Linear(8,16),
            nn.ReLU(),
            nn.Linear(16,8),
            nn.ReLU(),
            nn.Linear(8,8),
            nn.Softmax()
        )

    def forward(self,x):
        return self.seq(x)
    

In [None]:
# Define a FFN Network

ffn = FFNEncoder()
ffn_output = torch.tensor(ffn(output2))
x = ffn_output[0]

  return self._call_impl(*args, **kwargs)
  ffn_output = torch.tensor(ffn(output2))


In [271]:
x

tensor([0.1753, 0.1899, 0.1127, 0.0312, 0.1834, 0.1282, 0.1183, 0.0611])

In [272]:
y = np.argmax(x)
y

tensor(1)

In [2]:
from transformers import T5ForConditionalGeneration

model = T5ForConditionalGeneration.from_pretrained("t5-small")

encoder_layers = model.encoder.block
decoder_layers = model.decoder.block

first_encoder_layer = encoder_layers[0]
self_attention = first_encoder_layer.layer[0].SelfAttention

W_q = self_attention.q.weight
W_k = self_attention.k.weight
W_v = self_attention.v.weight

ffn = first_encoder_layer.layer[1].DenseReluDense
W_ff1 = ffn.wi.weight
W_ff2 = ffn.wo.weight


In [3]:
model.state_dict()

OrderedDict([('shared.weight',
              tensor([[ -2.0156,   0.2236,  -7.0938,  ...,  -0.3535,   2.6406,  -2.8906],
                      [ 12.6250,   8.1875, -11.6250,  ...,   7.9375,  -7.3125,   0.9453],
                      [ -8.7500,   7.1875,  27.8750,  ..., -26.7500,   0.8555,  -1.5156],
                      ...,
                      [-25.2500, -28.5000, -17.2500,  ..., -17.7500,  -5.2500,  27.3750],
                      [-25.5000, -29.3750, -18.2500,  ..., -17.7500,  -4.8125,  27.7500],
                      [-26.7500, -28.3750, -17.8750,  ..., -18.5000,  -7.0000,  27.6250]])),
             ('encoder.embed_tokens.weight',
              tensor([[ -2.0156,   0.2236,  -7.0938,  ...,  -0.3535,   2.6406,  -2.8906],
                      [ 12.6250,   8.1875, -11.6250,  ...,   7.9375,  -7.3125,   0.9453],
                      [ -8.7500,   7.1875,  27.8750,  ..., -26.7500,   0.8555,  -1.5156],
                      ...,
                      [-25.2500, -28.5000, -17.2500,  ...

In [4]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

model = T5ForConditionalGeneration.from_pretrained('t5-small', output_hidden_states=True)
tokenizer = T5Tokenizer.from_pretrained('t5-small')

input_text = "The house is wonderful"
inputs = tokenizer(input_text, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs, decoder_input_ids=inputs["input_ids"])

encoder_embeddings = model.encoder.embed_tokens(inputs["input_ids"])
encoder_hidden_states = outputs.encoder_hidden_states
decoder_hidden_states = outputs.decoder_hidden_states
logits = outputs.logits


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


In [5]:
encoder_embeddings[0].shape

torch.Size([5, 512])

In [6]:
encoder_embeddings[0][0]

tensor([ 21.6250,  -0.2129,  20.1250,   2.5625,   2.9688,  -6.1875,  -4.2500,
         -7.9375,  -5.4062,   5.8125, -17.3750,   2.4062,   1.9062,   4.1562,
        -11.0000,  -4.9062,   0.5195,  20.3750, -13.0625,  -9.8125, -12.9375,
        -32.2500,   1.0625, -11.3125,  23.8750,  -8.3750,  -0.0986,   1.3125,
          3.0625, -12.9375,  10.1250,  -5.4375,   6.1875,  -8.1250,   1.8047,
        -10.9375,  -1.5156,  17.0000,   6.7500, -16.5000, -17.8750, -18.2500,
          9.5625,  17.1250,  16.8750,  -2.5156,   5.0938,  -3.5625, -17.1250,
         -7.0938,   4.4375,  15.4375,   8.4375, -17.5000,  14.3125,  -8.1875,
         10.8125,   0.3516,  10.7500,   3.2656, -34.2500,  11.9375, -11.1875,
         13.1250,  33.5000, -12.1250,   4.3438,   6.5312, -18.3750,   7.2812,
         -3.3281,  -4.2812, -11.2500,  15.1250,   6.9062,  16.7500,  12.5000,
        -25.8750,   5.2812,   5.5625, -10.0625,  10.6250,   7.9062,  13.6875,
         34.2500,  -0.7266,  -3.1094,  13.2500,  15.6250,  -3.18

In [7]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

embedding_weights = model.encoder.embed_tokens.weight.detach()


In [8]:
vocab = {
    "▁The": 37,
    "▁house": 629,
    "▁is": 19,
    "▁wonderful": 1627,
    "<pad>": 0,
    "<eos>": 1
}

def simple_tokenizer(text):
    tokens = text.strip().split()
    tokens = [f"▁{t}" for t in tokens]
    return tokens

def encode(text, vocab, add_special_tokens=True):
    tokens = simple_tokenizer(text)
    ids = []
    for token in tokens:
        if token in vocab:
            ids.append(vocab[token])
        else:
            ids.append(vocab["<pad>"])
    if add_special_tokens:
        ids.append(vocab["<eos>"])
    return ids

text = "The house is wonderful"
ids = encode(text, vocab)
print(ids) 


[37, 629, 19, 1627, 1]


In [9]:
text = "The house is wonderful"
input_ids = tokenizer.encode(text, add_special_tokens=True)

print("Token IDs:", input_ids)


Token IDs: [37, 629, 19, 1627, 1]


In [10]:
embedding_vectors = [embedding_weights[token_id] for token_id in input_ids]
embeddings = torch.stack(embedding_vectors, dim=0)

print("Embeddings shape:", embeddings.shape)
embeddings = embeddings.unsqueeze(0)


Embeddings shape: torch.Size([5, 512])


In [None]:
with torch.no_grad():
    expected_embeddings = model.encoder.embed_tokens()
print(torch.allclose(embeddings, expected_embeddings.squeeze(0)))
embeddings

True


tensor([[[ 21.6250,  -0.2129,  20.1250,  ..., -14.8750,  22.6250,  34.7500],
         [ 18.5000,   2.1719,  15.3750,  ...,  14.7500,  -9.4375,   3.0312],
         [  4.5625,  -1.1953, -13.9375,  ...,  24.5000,  -2.9375,  11.5000],
         [-13.2500,  -0.2461,  11.1875,  ..., -34.0000,   8.2500,   1.8906],
         [ 12.6250,   8.1875, -11.6250,  ...,   7.9375,  -7.3125,   0.9453]]])

In [12]:
encoder_hidden_states[0].shape

torch.Size([1, 5, 512])

In [13]:
model.eval()

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [None]:
import numpy as np

block = model.encoder.block[0]
Wq = block.layer[0].SelfAttention.q.weight.detach().cpu().numpy().T
Wk = block.layer[0].SelfAttention.k.weight.detach().cpu().numpy().T
Wv = block.layer[0].SelfAttention.v.weight.detach().cpu().numpy().T
Wo = block.layer[0].SelfAttention.o.weight.detach().cpu().numpy().T

W1 = block.layer[1].DenseReluDense.wi.weight.detach().cpu().numpy().T
b1 = block.layer[1].DenseReluDense.wi.bias
if b1 is not None:
    b1 = b1.detach().cpu().numpy()
else:
    b1 = np.zeros(W1.shape[1])

W2 = block.layer[1].DenseReluDense.wo.weight.detach().cpu().numpy().T
b2 = block.layer[1].DenseReluDense.wo.bias
if b2 is not None:
    b2 = b2.detach().cpu().numpy()
else:
    b2 = np.zeros(W2.shape[1])

ln0_gamma = block.layer[0].layer_norm.weight.detach().cpu().numpy()
ln0_beta = np.zeros_like(ln0_gamma)
ln1_gamma = block.layer[1].layer_norm.weight.detach().cpu().numpy()
ln1_beta = np.zeros_like(ln1_gamma)

In [89]:
print(Wq.shape)
print(Wk.shape)
print(Wv.shape)
print(Wo.shape)

print(W1.shape)
print(b1.shape)
print(W2.shape)
print(b2.shape)

print(ln0_gamma.shape)
print(ln0_beta.shape)
print(ln1_gamma.shape)
print(ln1_beta.shape)

(512, 512)
(512, 512)
(512, 512)
(512, 512)
(512, 2048)
(2048,)
(2048, 512)
(512,)
(512,)
(512,)
(512,)
(512,)


In [19]:
num_heads = 8
d_model = 512
head_dim = d_model // num_heads

def split_heads(x):
    batch_size, seq_len, d_model = x.shape
    x = x.reshape(batch_size, seq_len, num_heads, head_dim)
    return np.transpose(x, (0, 2, 1, 3))

In [None]:
def scaled_dot_product_attention(Q, K, V, relative_bias=None):
    d_k = Q.shape[-1]
    scores = np.matmul(Q, K.transpose(0,1,3,2)) / np.sqrt(d_k)
    scores_exp = np.exp(scores - np.max(scores, axis=-1, keepdims=True))
    attention = scores_exp / np.sum(scores_exp, axis=-1, keepdims=True)
    output = np.matmul(attention, V)
    return output

def multi_head_attention(X, Wq, Wk, Wv, Wo, relative_bias=None):
    batch_size, seq_len, d_model = X.shape
    Q = X @ Wq 
    K = X @ Wk
    V = X @ Wv

    Q = split_heads(Q) 
    K = split_heads(K)
    V = split_heads(V)

    heads_output = scaled_dot_product_attention(Q, K, V, relative_bias)
    heads_output = heads_output.transpose(0, 2, 1, 3).reshape(batch_size, seq_len, d_model)
    output = heads_output @ Wo
    return output


In [21]:
def t5_layer_norm(x, gamma, eps=1e-6):
    rms = np.sqrt(np.mean(x ** 2, axis=-1, keepdims=True) + eps)
    norm_x = x / rms
    return norm_x * gamma

def feed_forward(x, W1, b1, W2, b2):
    hidden = np.maximum(0, x @ W1 + b1)
    output = hidden @ W2 + b2
    return output


def encoder_block1_forward(X, params):
    Wq, Wk, Wv, Wo, W1, b1, W2, b2, ln0_gamma, ln0_beta, ln1_gamma, ln1_beta = params
    normed1 = t5_layer_norm(X, ln0_gamma, ln0_beta)
    attn_out = multi_head_attention(normed1, Wq, Wk, Wv, Wo)
    res1 = X + attn_out
    normed2 = t5_layer_norm(res1, ln1_gamma, ln1_beta)
    ff_out = feed_forward(normed2, W1, b1, W2, b2)
    output = res1 + ff_out
    return output


In [22]:
embeddings_np = embeddings.squeeze(0).cpu().numpy()
embeddings_np = embeddings_np[np.newaxis, :, :]
params = (Wq, Wk, Wv, Wo, W1, b1, W2, b2, ln0_gamma, ln0_beta, ln1_gamma, ln1_beta)
encoder1_output = encoder_block1_forward(embeddings_np, params)
print("Encoder block 1 output shape:", encoder1_output.shape)


Encoder block 1 output shape: (1, 5, 512)


In [14]:
encoder_hidden_states[0]

tensor([[[ 21.6250,  -0.2129,  20.1250,  ..., -14.8750,  22.6250,  34.7500],
         [ 18.5000,   2.1719,  15.3750,  ...,  14.7500,  -9.4375,   3.0312],
         [  4.5625,  -1.1953, -13.9375,  ...,  24.5000,  -2.9375,  11.5000],
         [-13.2500,  -0.2461,  11.1875,  ..., -34.0000,   8.2500,   1.8906],
         [ 12.6250,   8.1875, -11.6250,  ...,   7.9375,  -7.3125,   0.9453]]])

In [None]:
encoder1_output[0]

array([[[ 21.615     ,  -0.22289063,  20.115     , ..., -14.885     ,
          22.615     ,  34.74      ],
        [ 18.49      ,   2.161875  ,  15.365     , ...,  14.74      ,
          -9.4475    ,   3.02125   ],
        [  4.5525    ,  -1.2053125 , -13.9475    , ...,  24.49      ,
          -2.9475    ,  11.49      ],
        [-13.26      ,  -0.25609374,  11.1775    , ..., -34.01      ,
           8.24      ,   1.880625  ],
        [ 12.615     ,   8.1775    , -11.635     , ...,   7.9275    ,
          -7.3225    ,   0.9353125 ]]], dtype=float32)

In [27]:
import numpy as np

block = model.encoder.block[1]
Wq = block.layer[0].SelfAttention.q.weight.detach().cpu().numpy().T
Wk = block.layer[0].SelfAttention.k.weight.detach().cpu().numpy().T
Wv = block.layer[0].SelfAttention.v.weight.detach().cpu().numpy().T
Wo = block.layer[0].SelfAttention.o.weight.detach().cpu().numpy().T

W1 = block.layer[1].DenseReluDense.wi.weight.detach().cpu().numpy().T
b1 = block.layer[1].DenseReluDense.wi.bias
if b1 is not None:
    b1 = b1.detach().cpu().numpy()
else:
    b1 = np.zeros(W1.shape[1])

W2 = block.layer[1].DenseReluDense.wo.weight.detach().cpu().numpy().T
b2 = block.layer[1].DenseReluDense.wo.bias
if b2 is not None:
    b2 = b2.detach().cpu().numpy()
else:
    b2 = np.zeros(W2.shape[1])

ln0_gamma = block.layer[0].layer_norm.weight.detach().cpu().numpy()
ln0_beta = np.zeros_like(ln0_gamma)

ln1_gamma = block.layer[1].layer_norm.weight.detach().cpu().numpy()
ln1_beta = np.zeros_like(ln1_gamma)



In [28]:
num_heads = 8
d_model = 512
head_dim = d_model // num_heads

def split_heads(x):
    batch_size, seq_len, d_model = x.shape
    x = x.reshape(batch_size, seq_len, num_heads, head_dim)
    return np.transpose(x, (0, 2, 1, 3))

In [None]:
def scaled_dot_product_attention(Q, K, V, relative_bias=None):
    d_k = Q.shape[-1]
    scores = np.matmul(Q, K.transpose(0,1,3,2)) / np.sqrt(d_k)
    scores_exp = np.exp(scores - np.max(scores, axis=-1, keepdims=True))
    attention = scores_exp / np.sum(scores_exp, axis=-1, keepdims=True)
    output = np.matmul(attention, V)
    return output

def multi_head_attention(X, Wq, Wk, Wv, Wo, relative_bias=None):
    batch_size, seq_len, d_model = X.shape
    Q = X @ Wq 
    K = X @ Wk
    V = X @ Wv

    Q = split_heads(Q) 
    K = split_heads(K)
    V = split_heads(V)

    heads_output = scaled_dot_product_attention(Q, K, V, relative_bias)
    heads_output = heads_output.transpose(0, 2, 1, 3).reshape(batch_size, seq_len, d_model)
    output = heads_output @ Wo
    return output

In [223]:
def t5_layer_norm(x, gamma, eps=1e-6):
    rms = np.sqrt(np.mean(x ** 2, axis=-1, keepdims=True) + eps)
    norm_x = x / rms
    return norm_x * gamma

def feed_forward(x, W1, b1, W2, b2):
    hidden = np.maximum(0, x @ W1 + b1)
    output = hidden @ W2 + b2
    return output


def encoder_block1_forward(X, params):
    Wq, Wk, Wv, Wo, W1, b1, W2, b2, ln0_gamma, ln0_beta, ln1_gamma, ln1_beta = params
    normed1 = t5_layer_norm(X, ln0_gamma, ln0_beta)
    attn_out = multi_head_attention(normed1, Wq, Wk, Wv, Wo)
    res1 = X + attn_out
    normed2 = t5_layer_norm(res1, ln1_gamma, ln1_beta)
    ff_out = feed_forward(normed2, W1, b1, W2, b2)
    output = res1 + ff_out
    return output


In [None]:
embeddings_np = embeddings.squeeze(0).cpu().numpy()
embeddings_np = embeddings_np[np.newaxis, :, :]
params = (Wq, Wk, Wv, Wo, W1, b1, W2, b2, ln0_gamma, ln0_beta, ln1_gamma, ln1_beta)
encoder2_output = encoder_block1_forward(embeddings_np, params)


In [31]:
encoder_hidden_states[1]

tensor([[[  9.9676,  -8.1595,  22.0996,  ...,  -2.3901,  37.4178,  10.9345],
         [ 29.3336,  16.5984,   9.7027,  ...,  33.7214, -26.7108,  -7.5691],
         [ -0.5460,   3.1402, -26.4466,  ...,  33.3656, -10.2248,  16.7484],
         [-32.1817, -10.6298,  13.1618,  ..., -47.1465,  22.7782,  -4.9772],
         [ 21.1208,   4.6036, -11.7239,  ...,   9.7551,  -1.6436, -17.4189]]])

In [None]:
encoder2_output[0]

array([[[  9.947641  ,  -8.179463  ,  22.079641  , ...,  -2.4101138 ,
          37.39776   ,  10.914473  ],
        [ 29.313643  ,  16.57844   ,   9.682721  , ...,  33.701366  ,
         -26.73078   ,  -7.589145  ],
        [ -0.56602216,   3.1201806 , -26.466616  , ...,  33.345608  ,
         -10.244814  ,  16.728363  ],
        [-32.201694  , -10.649811  ,  13.141762  , ..., -47.166504  ,
          22.758228  ,  -4.9971957 ],
        [ 21.100798  ,   4.583566  , -11.743885  , ...,   9.735119  ,
          -1.6635518 , -17.43892   ]]], dtype=float32)

In [None]:
temp = encoder2_output[0]

for i in range(2,6):

    block = model.encoder.block[i]
    Wq = block.layer[0].SelfAttention.q.weight.detach().cpu().numpy().T
    Wk = block.layer[0].SelfAttention.k.weight.detach().cpu().numpy().T
    Wv = block.layer[0].SelfAttention.v.weight.detach().cpu().numpy().T
    Wo = block.layer[0].SelfAttention.o.weight.detach().cpu().numpy().T

    W1 = block.layer[1].DenseReluDense.wi.weight.detach().cpu().numpy().T
    b1 = block.layer[1].DenseReluDense.wi.bias
    if b1 is not None:
        b1 = b1.detach().cpu().numpy()
    else:
        b1 = np.zeros(W1.shape[1])

    W2 = block.layer[1].DenseReluDense.wo.weight.detach().cpu().numpy().T
    b2 = block.layer[1].DenseReluDense.wo.bias
    if b2 is not None:
        b2 = b2.detach().cpu().numpy()
    else:
        b2 = np.zeros(W2.shape[1])

    ln0_gamma = block.layer[0].layer_norm.weight.detach().cpu().numpy()
    ln0_beta = np.zeros_like(ln0_gamma)

    ln1_gamma = block.layer[1].layer_norm.weight.detach().cpu().numpy()
    ln1_beta = np.zeros_like(ln1_gamma)

    num_heads = 8
    d_model = 512
    head_dim = d_model // num_heads

    def split_heads(x):
        batch_size, seq_len, d_model = x.shape
        x = x.reshape(batch_size, seq_len, num_heads, head_dim)
        return np.transpose(x, (0, 2, 1, 3))
    
    def scaled_dot_product_attention(Q, K, V, relative_bias=None):
        d_k = Q.shape[-1]
        scores = np.matmul(Q, K.transpose(0,1,3,2)) / np.sqrt(d_k)
        if relative_bias is not None:
            scores += relative_bias
        scores_exp = np.exp(scores - np.max(scores, axis=-1, keepdims=True))
        attention = scores_exp / np.sum(scores_exp, axis=-1, keepdims=True)
        output = np.matmul(attention, V)
        return output

    def multi_head_attention(X, Wq, Wk, Wv, Wo, relative_bias=None):
        batch_size, seq_len, d_model = X.shape
        Q = X @ Wq 
        K = X @ Wk
        V = X @ Wv

        Q = split_heads(Q) 
        K = split_heads(K)
        V = split_heads(V)

        heads_output = scaled_dot_product_attention(Q, K, V, relative_bias)
        heads_output = heads_output.transpose(0, 2, 1, 3).reshape(batch_size, seq_len, d_model)
        output = heads_output @ Wo
        return output
    
    def t5_layer_norm(x, gamma, eps=1e-6):
        rms = np.sqrt(np.mean(x ** 2, axis=-1, keepdims=True) + eps)
        norm_x = x / rms
        return norm_x * gamma

    def feed_forward(x, W1, b1, W2, b2):
        hidden = np.maximum(0, x @ W1 + b1)
        output = hidden @ W2 + b2
        return output


    def encoder_block1_forward(X, params):
        Wq, Wk, Wv, Wo, W1, b1, W2, b2, ln0_gamma, ln0_beta, ln1_gamma, ln1_beta = params
        normed1 = t5_layer_norm(X, ln0_gamma, ln0_beta)
        attn_out = multi_head_attention(normed1, Wq, Wk, Wv, Wo)
        res1 = X + attn_out
        normed2 = t5_layer_norm(res1, ln1_gamma, ln1_beta)
        ff_out = feed_forward(normed2, W1, b1, W2, b2)
        output = res1 + ff_out
        return output
    
    embeddings_np = embeddings.squeeze(0).cpu().numpy()
    embeddings_np = embeddings_np[np.newaxis, :, :]
    params = (Wq, Wk, Wv, Wo, W1, b1, W2, b2, ln0_gamma, ln0_beta, ln1_gamma, ln1_beta)
    previous_output = temp
    encoder_final_output = encoder_block1_forward(previous_output, params)
    temp = encoder_final_output



In [41]:
encoder_hidden_states[6]

tensor([[[ 0.1419, -0.0957,  0.0889,  ..., -0.2056,  0.0724, -0.0712],
         [-0.2174, -0.0906,  0.0813,  ...,  0.1371, -0.1766, -0.1062],
         [ 0.0115, -0.1079, -0.1664,  ..., -0.0255, -0.0383,  0.0166],
         [-0.0298, -0.0325,  0.0006,  ..., -0.2016,  0.0682, -0.0404],
         [ 0.1554,  0.0256, -0.0294,  ..., -0.0680,  0.1108, -0.0173]]])

In [None]:
encoder_final_output[0]

array([[[ 0.12488326, -0.11270864,  0.07194532, ..., -0.22257452,
          0.05540454, -0.08821604],
        [-0.2343867 , -0.10757293,  0.06425723, ...,  0.12013382,
         -0.19361147, -0.12317893],
        [-0.00548037, -0.12489459, -0.1834474 , ..., -0.04251202,
         -0.05530342, -0.00042997],
        [-0.04681292, -0.04945379, -0.01642574, ..., -0.21857388,
          0.05124101, -0.05737096],
        [ 0.13839199,  0.0086256 , -0.04635658, ..., -0.08501381,
          0.09383072, -0.03425261]]], dtype=float32)

In [None]:
mask = torch.rand(size=(len(tokens), len(tokens)))

for i in range(len(mask)):
    for j in range(len(mask[0])):
        if j <= i:
            mask[i][j] = 0
        else:
            mask[i][j] = float("-inf")

NameError: name 'torch' is not defined

In [50]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

decoder_input_text = "The answer is"
decoder_input_ids = tokenizer(decoder_input_text, return_tensors='pt').input_ids

decoder_embeddings = model.shared(decoder_input_ids)


In [None]:
block = model.decoder.block[0]

Wq_self = block.layer[0].SelfAttention.q.weight.detach().cpu().numpy().T
Wk_self = block.layer[0].SelfAttention.k.weight.detach().cpu().numpy().T
Wv_self = block.layer[0].SelfAttention.v.weight.detach().cpu().numpy().T
Wo_self = block.layer[0].SelfAttention.o.weight.detach().cpu().numpy().T

Wq_cross = block.layer[1].EncDecAttention.q.weight.detach().cpu().numpy().T
Wk_cross = block.layer[1].EncDecAttention.k.weight.detach().cpu().numpy().T
Wv_cross = block.layer[1].EncDecAttention.v.weight.detach().cpu().numpy().T
Wo_cross = block.layer[1].EncDecAttention.o.weight.detach().cpu().numpy().T

W1 = block.layer[2].DenseReluDense.wi.weight.detach().cpu().numpy().T
b1 = block.layer[2].DenseReluDense.wi.bias
if b1 is not None:
    b1 = b1.detach().cpu().numpy()
else:
    b1 = np.zeros(W1.shape[1])

W2 = block.layer[2].DenseReluDense.wo.weight.detach().cpu().numpy().T
b2 = block.layer[2].DenseReluDense.wo.bias
if b2 is not None:
    b2 = b2.detach().cpu().numpy()
else:
    b2 = np.zeros(W2.shape[1])

ln0_gamma = block.layer[0].layer_norm.weight.detach().cpu().numpy()
ln1_gamma = block.layer[1].layer_norm.weight.detach().cpu().numpy()
ln2_gamma = block.layer[2].layer_norm.weight.detach().cpu().numpy()

ln0_beta = np.zeros_like(ln0_gamma)
ln1_beta = np.zeros_like(ln1_gamma)
ln2_beta = np.zeros_like(ln2_gamma)

num_heads = 8
d_model = 512
head_dim = d_model // num_heads

def split_heads(x):
    batch_size, seq_len, d_model = x.shape
    x = x.reshape(batch_size, seq_len, num_heads, head_dim)
    return np.transpose(x, (0, 2, 1, 3))

def scaled_dot_product_attention(Q, K, V, relative_bias=None):
    d_k = Q.shape[-1]
    scores = np.matmul(Q, K.T) / np.sqrt(d_k)
    if relative_bias is not None:
        scores += relative_bias
    scores_exp = np.exp(scores - np.max(scores, axis=-1, keepdims=True))
    attention = scores_exp / np.sum(scores_exp, axis=-1, keepdims=True)
    output = np.matmul(attention, V)
    return output

def multi_head_attention(Q_input, K_input, V_input, Wq, Wk, Wv, Wo, relative_bias=None):
    Q = Q_input @ Wq
    K = K_input @ Wk
    V = V_input @ Wv

    Q = split_heads(Q)
    K = split_heads(K)
    V = split_heads(V)

    heads_output = scaled_dot_product_attention(Q, K, V, relative_bias)
    heads_output = heads_output.transpose(0, 2, 1, 3).reshape(Q_input.shape[0], Q_input.shape[1], d_model)
    output = heads_output @ Wo
    return output

def t5_layer_norm(x, gamma, beta, eps=1e-6):
    rms = np.sqrt(np.mean(x ** 2, axis=-1, keepdims=True) + eps)
    norm_x = x / rms
    return norm_x * gamma + beta

def feed_forward(x, W1, b1, W2, b2):
    hidden = np.maximum(0, x @ W1 + b1)
    output = hidden @ W2 + b2
    return output

def decoder_block_forward(X, encoder_output, params):
    (Wq_self, Wk_self, Wv_self, Wo_self,
        Wq_cross, Wk_cross, Wv_cross, Wo_cross,
        W1, b1, W2, b2,
        ln0_gamma, ln0_beta, ln1_gamma, ln1_beta, ln2_gamma, ln2_beta) = params

    normed1 = t5_layer_norm(X, ln0_gamma, ln0_beta)
    attn_self_out = multi_head_attention(normed1, normed1, normed1, Wq_self, Wk_self, Wv_self, Wo_self, mask)
    res1 = X + attn_self_out

    normed2 = t5_layer_norm(res1, ln1_gamma, ln1_beta)
    attn_cross_out = multi_head_attention(normed2, encoder_output, encoder_output, Wq_cross, Wk_cross, Wv_cross, Wo_cross)
    res2 = res1 + attn_cross_out

    normed3 = t5_layer_norm(res2, ln2_gamma, ln2_beta)
    ff_out = feed_forward(normed3, W1, b1, W2, b2)
    output = res2 + ff_out
    return output

encoder_output_np = encoder_final_output
decoder_input_np = decoder_embeddings.squeeze(0).detach().cpu().numpy()[np.newaxis, :, :]

params = (
    Wq_self, Wk_self, Wv_self, Wo_self,
    Wq_cross, Wk_cross, Wv_cross, Wo_cross,
    W1, b1, W2, b2,
    ln0_gamma, ln0_beta, ln1_gamma, ln1_beta, ln2_gamma, ln2_beta
)

decoder_output1 = decoder_block_forward(decoder_input_np, encoder_output_np, params)


In [57]:
decoder_hidden_states[1]

tensor([[[ 43.7896, -23.3442,  70.8558,  ..., -59.1159,  60.4994,  23.8016],
         [  8.6981, -23.9141,  -0.8469,  ..., -45.8692, -16.9217,  38.2776],
         [ -7.1065, -12.8938, -18.0233,  ...,  72.7277,  18.9190,  45.6754],
         [-10.9525, -20.6448,  73.2725,  ..., -27.9529,  -6.0691, -29.7943],
         [ 22.5672, -60.9759,  87.0815,  ..., -14.8507,  70.1290,  34.2144]]])

In [None]:
decoder_output1

array([[[ 43.609566 , -23.524185 ,  70.675804 , ..., -59.295883 ,
          60.319366 ,  23.621647 ],
        [  8.518104 , -24.094147 ,  -1.026902 , ..., -46.049194 ,
         -17.101719 ,  38.097637 ],
        [ -7.2864866, -13.073762 , -18.203321 , ...,  72.5477   ,
          18.738974 ,  45.495384 ],
        [-11.13254  , -20.82483  ,  73.092514 , ..., -28.132927 ,
          -6.2490764, -29.97431  ],
        [ 22.387188 , -61.155884 ,  86.90145  , ..., -15.030689 ,
          69.94899  ,  34.034374 ]]], dtype=float32)

In [None]:
temp = decoder_output1

for i in range(1, 6):

    block = model.decoder.block[i]
    
    Wq_self = block.layer[0].SelfAttention.q.weight.detach().cpu().numpy().T
    Wk_self = block.layer[0].SelfAttention.k.weight.detach().cpu().numpy().T
    Wv_self = block.layer[0].SelfAttention.v.weight.detach().cpu().numpy().T
    Wo_self = block.layer[0].SelfAttention.o.weight.detach().cpu().numpy().T

    Wq_cross = block.layer[1].EncDecAttention.q.weight.detach().cpu().numpy().T
    Wk_cross = block.layer[1].EncDecAttention.k.weight.detach().cpu().numpy().T
    Wv_cross = block.layer[1].EncDecAttention.v.weight.detach().cpu().numpy().T
    Wo_cross = block.layer[1].EncDecAttention.o.weight.detach().cpu().numpy().T

    W1 = block.layer[2].DenseReluDense.wi.weight.detach().cpu().numpy().T
    b1 = block.layer[2].DenseReluDense.wi.bias
    if b1 is not None:
        b1 = b1.detach().cpu().numpy()
    else:
        b1 = np.zeros(W1.shape[1])

    W2 = block.layer[2].DenseReluDense.wo.weight.detach().cpu().numpy().T
    b2 = block.layer[2].DenseReluDense.wo.bias
    if b2 is not None:
        b2 = b2.detach().cpu().numpy()
    else:
        b2 = np.zeros(W2.shape[1])

    ln0_gamma = block.layer[0].layer_norm.weight.detach().cpu().numpy()
    ln1_gamma = block.layer[1].layer_norm.weight.detach().cpu().numpy()
    ln2_gamma = block.layer[2].layer_norm.weight.detach().cpu().numpy()

    ln0_beta = np.zeros_like(ln0_gamma)
    ln1_beta = np.zeros_like(ln1_gamma)
    ln2_beta = np.zeros_like(ln2_gamma)

    num_heads = 8
    d_model = 512
    head_dim = d_model // num_heads

    def split_heads(x):
        batch_size, seq_len, d_model = x.shape
        x = x.reshape(batch_size, seq_len, num_heads, head_dim)
        return np.transpose(x, (0, 2, 1, 3))

    def scaled_dot_product_attention(Q, K, V, relative_bias=None):
        d_k = Q.shape[-1]
        scores = np.matmul(Q, K.transpose(0, 1, 3, 2)) / np.sqrt(d_k)
        if relative_bias is not None:
            scores += relative_bias
        scores_exp = np.exp(scores - np.max(scores, axis=-1, keepdims=True))
        attention = scores_exp / np.sum(scores_exp, axis=-1, keepdims=True)
        output = np.matmul(attention, V)
        return output

    def multi_head_attention(Q_input, K_input, V_input, Wq, Wk, Wv, Wo, relative_bias=None):
        Q = Q_input @ Wq
        K = K_input @ Wk
        V = V_input @ Wv

        Q = split_heads(Q)
        K = split_heads(K)
        V = split_heads(V)

        heads_output = scaled_dot_product_attention(Q, K, V, relative_bias)
        heads_output = heads_output.transpose(0, 2, 1, 3).reshape(Q_input.shape[0], Q_input.shape[1], d_model)
        output = heads_output @ Wo
        return output

    def t5_layer_norm(x, gamma, beta, eps=1e-6):
        rms = np.sqrt(np.mean(x ** 2, axis=-1, keepdims=True) + eps)
        norm_x = x / rms
        return norm_x * gamma + beta

    def feed_forward(x, W1, b1, W2, b2):
        hidden = np.maximum(0, x @ W1 + b1)
        output = hidden @ W2 + b2
        return output

    def decoder_block_forward(X, encoder_output, params):
        (Wq_self, Wk_self, Wv_self, Wo_self,
         Wq_cross, Wk_cross, Wv_cross, Wo_cross,
         W1, b1, W2, b2,
         ln0_gamma, ln0_beta, ln1_gamma, ln1_beta, ln2_gamma, ln2_beta) = params

        normed1 = t5_layer_norm(X, ln0_gamma, ln0_beta)
        attn_self_out = multi_head_attention(normed1, normed1, normed1, Wq_self, Wk_self, Wv_self, Wo_self)
        res1 = X + attn_self_out

        normed2 = t5_layer_norm(res1, ln1_gamma, ln1_beta)
        attn_cross_out = multi_head_attention(normed2, encoder_output, encoder_output, Wq_cross, Wk_cross, Wv_cross, Wo_cross)
        res2 = res1 + attn_cross_out

        normed3 = t5_layer_norm(res2, ln2_gamma, ln2_beta)
        ff_out = feed_forward(normed3, W1, b1, W2, b2)
        output = res2 + ff_out
        return output

    encoder_output_np = encoder_final_output
    decoder_input_np = temp

    params = (
        Wq_self, Wk_self, Wv_self, Wo_self,
        Wq_cross, Wk_cross, Wv_cross, Wo_cross,
        W1, b1, W2, b2,
        ln0_gamma, ln0_beta, ln1_gamma, ln1_beta, ln2_gamma, ln2_beta
    )
    previous_output = temp
    decoder_output_np = decoder_block_forward(decoder_input_np, encoder_output_np, params)
    temp = decoder_output_np


In [63]:
decoder_hidden_states[6]

tensor([[[ 1.3594e-01,  1.1107e-01, -4.6780e-02,  ..., -2.9845e-02,
           7.2804e-04,  3.8300e-02],
         [ 1.2243e-01,  1.6065e-01, -7.5456e-02,  ..., -3.2210e-02,
           4.4335e-04,  2.3429e-01],
         [-4.2277e-02,  2.1329e-01,  3.7140e-02,  ...,  4.5189e-03,
           3.8768e-04,  1.7418e-01],
         [ 1.1096e-01,  2.8968e-01, -3.5625e-02,  ...,  3.9716e-02,
           7.4936e-06,  8.3287e-02],
         [ 1.1195e-01,  1.4506e-01,  1.0306e-02,  ..., -9.8582e-03,
           2.1646e-04,  1.9090e-01]]])

In [None]:
decoder_output_np

array([[[ 0.03593772,  0.01106936, -0.14678048, ..., -0.1298446 ,
         -0.09927196, -0.06169962],
        [ 0.0224255 ,  0.06065487, -0.1754556 , ..., -0.13220973,
         -0.09955665,  0.13429067],
        [-0.14227688,  0.11329357, -0.0628598 , ..., -0.09548108,
         -0.09961233,  0.07417569],
        [ 0.01095598,  0.18968216, -0.13562532, ..., -0.06028403,
         -0.09999251, -0.01671267],
        [ 0.01194875,  0.0450618 , -0.08969433, ..., -0.10985817,
         -0.09978354,  0.09089697]]], dtype=float32)

In [None]:
embedding_matrix = model.shared.weight.detach().cpu().numpy()
logits_np = np.matmul(decoder_output_np, embedding_matrix.T)

In [None]:
predicted_token_ids = np.argmax(logits_np, axis=-1)
predicted_token_ids


array([[15924,  5049, 27751,     5, 14652]], dtype=int64)

In [82]:
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained('t5-small')
predicted_tokens = tokenizer.batch_decode(predicted_token_ids, skip_special_tokens=True)
print(predicted_tokens)


['Mason Mel magnif. pools']
