In [1]:
import math

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import dataset
import numpy as np
import matplotlib.pyplot as plt

In [3]:
m = nn.Linear(3, 5)
input = torch.randn(30, 3)
output = m(input)
print(output.size())

torch.Size([30, 5])


In [4]:
print(m.weight)
print(m.weight.shape)

Parameter containing:
tensor([[ 0.2815,  0.4312,  0.2928],
        [ 0.4351, -0.4102,  0.5231],
        [ 0.4547,  0.0486, -0.2896],
        [ 0.2401,  0.3523,  0.1633],
        [ 0.4645, -0.4825,  0.5104]], requires_grad=True)
torch.Size([5, 3])


In [5]:
m = nn.Linear(3, 2)
input = torch.randn(5, 3)
output = m(input)
print(output.size())

torch.Size([5, 2])


In [6]:
print(output)

tensor([[ 0.7604,  0.7423],
        [ 1.1111,  0.3386],
        [ 0.9002, -0.5376],
        [ 0.5250, -0.3110],
        [ 0.0945,  0.5233]], grad_fn=<AddmmBackward0>)


In [7]:
print(input)

tensor([[ 1.2168,  0.5998, -0.7084],
        [ 1.3243,  0.3429,  0.4462],
        [-0.1091, -0.4260,  1.4350],
        [-0.4175, -0.4694,  0.2484],
        [-0.6214,  1.3768,  0.5092]])


In [8]:
print(m.weight)
print(m.weight.shape)

Parameter containing:
tensor([[ 0.4136, -0.2181,  0.2167],
        [ 0.1657,  0.5059, -0.2525]], requires_grad=True)
torch.Size([2, 3])


In [9]:
print(m.bias)

Parameter containing:
tensor([0.5415, 0.0583], requires_grad=True)


In [49]:
#different Dimension 3D vs 2d weight
m = nn.Linear(3, 2)
input = torch.randn(2, 5, 3)
output = m(input)
print(output.size())

torch.Size([2, 5, 2])


In [50]:
print(input[0])
print(m.weight)
print(m.bias)
print(output[0])

tensor([[-0.4020,  0.9819, -0.6262],
        [ 1.0931,  0.1125,  1.3712],
        [-1.0081,  2.2841,  0.8125],
        [ 0.1829,  1.9974,  0.6805],
        [-0.9521, -0.3974, -1.5116]])
Parameter containing:
tensor([[-0.3362, -0.5557,  0.1012],
        [ 0.0342, -0.1919, -0.5346]], requires_grad=True)
Parameter containing:
tensor([-0.5310,  0.4198], requires_grad=True)
tensor([[-1.0048,  0.5524],
        [-0.8223, -0.2975],
        [-1.3791, -0.4873],
        [-1.6336, -0.3210],
        [-0.1431,  1.2716]], grad_fn=<SelectBackward0>)


In [51]:
print(input[1])
print(m.weight)
print(m.bias)
print(output[1])

tensor([[-3.1956,  0.3942, -0.1056],
        [ 2.4017, -1.2683, -1.5703],
        [ 0.0201,  0.9788,  1.0141],
        [-1.2786,  0.6511,  0.2436],
        [ 0.2166, -2.0369, -0.4048]])
Parameter containing:
tensor([[-0.3362, -0.5557,  0.1012],
        [ 0.0342, -0.1919, -0.5346]], requires_grad=True)
Parameter containing:
tensor([-0.5310,  0.4198], requires_grad=True)
tensor([[ 0.3136,  0.2914],
        [-0.7926,  1.5847],
        [-0.9790, -0.3095],
        [-0.4383,  0.1209],
        [ 0.4871,  1.0345]], grad_fn=<SelectBackward0>)


In [109]:
#adding diff size tensor
input_a = torch.randn(8, 512, 64)
input_b = torch.randn(1, 512, 64)

input_c = input_a + input_b

In [110]:
input_c.shape

torch.Size([8, 512, 64])

In [111]:
input_a = torch.randn(5, 3)
input_b = torch.randn(1, 3)
input_c = input_a + input_b
print(input_a)
print(input_b)
print(input_c)

tensor([[ 1.0551,  0.7041,  1.5120],
        [-0.4691, -0.2462, -0.0187],
        [ 0.7619, -0.5605, -0.2905],
        [ 0.7746,  0.8708,  0.7130],
        [ 0.3988,  0.5639,  0.4560]])
tensor([[ 1.9520, -0.3821, -0.5042]])
tensor([[ 3.0070,  0.3219,  1.0078],
        [ 1.4829, -0.6284, -0.5229],
        [ 2.7138, -0.9427, -0.7948],
        [ 2.7265,  0.4887,  0.2088],
        [ 2.3508,  0.1818, -0.0482]])


In [112]:
#softmax attention 2D
input_attn = torch.randn(5, 3)
output_softmax = F.softmax(input_attn, dim= -1)

In [113]:
input_attn

tensor([[-0.4420, -1.3317, -0.3135],
        [ 1.6815, -1.8824,  0.3285],
        [-1.0429,  2.1300, -0.8862],
        [ 0.0844,  0.0209, -0.6018],
        [-0.3678, -0.8172,  1.1517]])

In [114]:
output_softmax

tensor([[0.3925, 0.1612, 0.4463],
        [0.7771, 0.0220, 0.2009],
        [0.0384, 0.9167, 0.0449],
        [0.4095, 0.3843, 0.2062],
        [0.1611, 0.1028, 0.7361]])

In [115]:
#softmax attention 3D
input_attn = torch.randn(2,5, 3)
output_softmax = F.softmax(input_attn, dim= -1)

In [116]:
input_attn[0]

tensor([[ 0.4619,  0.3188,  2.3744],
        [ 0.3249, -0.6101, -1.1520],
        [-0.2039, -0.7993, -2.5554],
        [-1.8620, -1.0116,  0.0179],
        [ 0.2838, -1.0790, -0.7506]])

In [117]:
output_softmax[0]

tensor([[0.1158, 0.1004, 0.7839],
        [0.6169, 0.2422, 0.1409],
        [0.6073, 0.3348, 0.0578],
        [0.1011, 0.2366, 0.6623],
        [0.6206, 0.1588, 0.2206]])

In [118]:
input_attn[1]

tensor([[ 1.6664,  1.1483,  0.8893],
        [ 0.6653, -1.2694, -0.0069],
        [-0.4746, -1.0764,  0.9368],
        [-0.3177, -0.4273, -0.2695],
        [ 1.1880,  0.8569, -0.4847]])

In [119]:
output_softmax[1]

tensor([[0.4865, 0.2898, 0.2237],
        [0.6042, 0.0873, 0.3085],
        [0.1770, 0.0970, 0.7260],
        [0.3395, 0.3043, 0.3563],
        [0.5247, 0.3768, 0.0985]])

In [76]:
class MultiHeadAttention(nn.Module):
    def __init__(self,d_k,d_model,n_heads):
        super().__init__()
        
        # Assume d_v = d_k
        self.d_k = d_k
        self.n_heads = n_heads
        
        self.key = nn.Linear(d_model,d_k * n_heads)
        self.query = nn.Linear(d_model, d_k * n_heads)
        self.value = nn.Linear(d_model, d_k * n_heads)
        
        #final linear layer
        self.fc = nn.Linear(d_k * n_heads, d_model)
        
    def forward(self, q, k, v, mask=None):

        #Note it's same for q,k,v dimension is same        
        #q before  and self.query : torch.Size([8, 512, 64]) torch.Size([64, 64])
        #q after : torch.Size([8, 512, 64])  
        #it's like  operate 8 time of different each [512,64] * [64,64] = [512,64]
           
        q = self.query(q) #N x T x (hd_k)      
        k = self.key(k) #N x T x (hd_k)
        v = self.value(v) #N x T x (hd_k)

        
        N = q.shape[0] #N=8
        T = q.shape[1] #T=512
        
        #change the shape to
        #(N,T,h,d_k) -> (N,h,T,d_k)
        #in order for matrix multiply works
        
        #n_heads = 4
        #self.d_k = 16
        #(N, T, self.n_heads, self.d_k) => (8,512,4,16)
        
        
        q = q.view(N, T, self.n_heads, self.d_k).transpose(1,2)
        #q,k,v after : [8, 4, 512, 16])  N,h,T,d_K
        
        k = k.view(N, T, self.n_heads, self.d_k).transpose(1,2)
        v = v.view(N, T, self.n_heads, self.d_k).transpose(1,2)
        
        # compute attention weights
        # (N,h,T,d_K) x (N,h,d_k,T) --> (N,h,T,T)
        

        #q [8, 4, 512, 16])  N,h,T,d_K
        #k.transpose(-2,-1) :',torch.Size([8, 4, 16, 512])
        #attn_scores [8, 4, 512, 512]
        attn_scores = q @ k.transpose(-2,-1) / math.sqrt(self.d_k)
        if mask is not None:
           attn_scores = attn_scores.masked_fill(mask[:, None, None, :] == 0, float('-inf'))
        
        #attn_weights [8, 4, 512, 512] same as attn_scores N,h,T,T
        attn_weights = F.softmax(attn_scores, dim= -1)
        
        
        #compute attention-weighted values
        #(N,h,T,T) * (N,h,T,d_k) -> (N,h,T,d_k)
        
        A = attn_weights @ v
        
        #reshape it back before final linear layer
        A = A.transpose(1,2) #(N,T,h,d_k)
        A = A.contiguous().view(N, T, self.d_k * self.n_heads) # (N,T,h*d_k)
        
        #A before is (8,512,4,16) and rearrange to A :[8, 512, 64]
        #projection
        return self.fc(A)
        
    
        
        

In [77]:
class TransformerBlock(nn.Module):
    def __init__(self,d_k, d_model, n_heads, dropout_prob=0.1):
        super().__init__()
        
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)
        self.mha = MultiHeadAttention(d_k,d_model,n_heads)
        self.ann = nn.Sequential(
            nn.Linear(d_model, d_model *4),
            nn.GELU(),
            nn.Linear(d_model * 4, d_model),
            nn.Dropout(dropout_prob),
        )
        self.dropout = nn.Dropout(p=dropout_prob)
        
    def forward(self,x, mask=None):
        #x [8, 512, 64], self.mha(x,x,x, mask) #attention block [8, 512, 64]
        
        x = self.ln1(x + self.mha(x,x,x, mask))
        x = self.ln2(x + self.ann(x))
        x = self.dropout(x)
        
        return x
    
        
        
        
        

In [101]:
class PositionalEncoding(nn.Module):
    def __init__(self,d_model, max_len=2048, dropout_prob=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout_prob)
        
        position = torch.arange(max_len).unsqueeze(1)       #position : torch.Size([1024, 1])
        exp_term = torch.arange(0,d_model,2)                #exp_term : torch.Size([32])
        div_term = torch.exp(exp_term * (-math.log(10000.0) / d_model))     #div_term : torch.Size([32]) 
        pe = torch.zeros(1, max_len, d_model)   #pe : torch.Size([1, 1024, 64])
        
        pe[0, :, 0::2] = torch.sin(position * div_term)
        pe[0, :, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe',pe)
        
        
    def forward(self,x):
        # x.shape: N x T x D #torch.Size([8, 512, 64])
        #self.pe[:,:x.size(1), :]  torch.Size([1, 512, 64])
        #every row of data, has the same "additional" positional encoding
        x = x + self.pe[:,:x.size(1), :]
        #x after , still has size torch.Size([8, 512, 64])
        
        return self.dropout(x)
        
        
        

In [102]:
class Encoder(nn.Module):
    def __init__(self,
                 vocab_size, #20_000
                 max_len,    #1024
                 d_k,        #16
                 d_model,    #64
                 n_heads,    #4
                 n_layers,   #2
                 n_classes,  #5
                 dropout_prob): #0.1
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size,d_model)
        self.pos_encoding = PositionalEncoding(d_model, max_len, dropout_prob)
        transformer_blocks = [
            TransformerBlock(
                d_k,
                d_model,
                n_heads,
                dropout_prob) for _ in range(n_layers)]
        self.transformer_blocks = nn.Sequential(*transformer_blocks)
        self.ln = nn.LayerNorm(d_model)
        self.fc = nn.Linear(d_model, n_classes)
        
        
    def forward(self, x, mask= None):
        
        #x shape  torch.Size([8, 512])
        
        x = self.embedding(x)  #x shape after embed : torch.Size([8, 512, 64])
        x = self.pos_encoding(x)  #x shape after pos encode: torch.Size([8, 512, 64])

        #this loop 2 time, as in n_layers and each has same dimension
        #x tranf block iterate  torch.Size([8, 512, 64])
        for block in self.transformer_blocks:
            x = block(x,mask)

        
        #many to one (x has the shape N x T x D)
        x = x[:,0,:]
        x = self.ln(x)
        x = self.fc(x)
        return x
        
        

model = Encoder(20_000,1024,16,64,4,2,5,0.1)

In [103]:
model = Encoder(20_000,1024,16,64,4,2,5,0.1)

In [104]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)

cuda:0


Encoder(
  (embedding): Embedding(20000, 64)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (mha): MultiHeadAttention(
        (key): Linear(in_features=64, out_features=64, bias=True)
        (query): Linear(in_features=64, out_features=64, bias=True)
        (value): Linear(in_features=64, out_features=64, bias=True)
        (fc): Linear(in_features=64, out_features=64, bias=True)
      )
      (ann): Sequential(
        (0): Linear(in_features=64, out_features=256, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=256, out_features=64, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, 

In [105]:
x = np.random.randint(0,20_000,size=(8,512))
x_t = torch.tensor(x).to(device)

In [106]:
mask = np.ones((8,512))
mask[:,256:] = 0
mask_t = torch.tensor(mask).to(device)

In [107]:
y = model(x_t, mask_t)

xshape: torch.Size([8, 512, 64])
self.pe[:,:x.size(1), :]: torch.Size([1, 512, 64])
after xshape  torch.Size([8, 512, 64])


In [48]:
y

tensor([[ 0.5878,  0.5137,  0.3711, -0.1257,  0.4204],
        [ 1.3933, -1.0663, -0.0730, -0.1003, -0.1185],
        [ 0.3367,  0.1441,  0.2411, -0.8343,  0.8958],
        [-0.3690,  0.2260, -0.5430,  0.2643, -0.6687],
        [-0.4171,  0.4315, -0.0418,  0.7934,  0.0919],
        [ 0.5028,  0.5685, -0.0112,  0.3434,  0.9544],
        [ 0.5288,  0.5362, -0.0085,  1.0734, -0.1748],
        [ 0.2549,  1.0098, -0.0616, -0.6088,  0.7905]], device='cuda:0',
       grad_fn=<AddmmBackward0>)

In [49]:
#### LET"S DEBUG ONE BY ONE

In [50]:
class MultiHeadAttention_DEBUG_1(nn.Module):
    def __init__(self,d_k,d_model,n_heads):
        super().__init__()
        
        # Assume d_v = d_k
        self.d_k = d_k
        self.n_heads = n_heads
        
        self.key = nn.Linear(d_model,d_k * n_heads)
        self.query = nn.Linear(d_model, d_k * n_heads)
        self.value = nn.Linear(d_model, d_k * n_heads)
        
        #final linear layer
        self.fc = nn.Linear(d_k * n_heads, d_model)
        
    def forward(self, q, k, v, mask=None):
        q = self.query(q) #N x T x (hd_k)
        k = self.key(k) #N x T x (hd_k)
        v = self.value(v) #N x T x (hd_k)
        
        N = q.shape[0]
        T = q.shape[1]
        
        #change the shape to
        #(N,T,h,d_k) -> (N,h,T,d_k)
        #in order for matrix multiply works
        
        q = q.view(N, T, self.n_heads, self.d_k).transpose(1,2)
        k = k.view(N, T, self.n_heads, self.d_k).transpose(1,2)
        v = v.view(N, T, self.n_heads, self.d_k).transpose(1,2)
        
        # compute attention weights
        # (N,h,T,d_K) x (N,h,d_k,T) --> (N,h,T,T)
        
        attn_scores = q @ k.transpose(-2,-1) / math.sqrt(self.d_k)
        if mask is not None:
           attn_scores = attn_scores.masked_fill(mask[:, None, None, :] == 0, float('-inf'))
        
        attn_weights = F.softmax(attn_scores, dim= -1)
        
        #compute attention-weighted values
        #(N,h,T,T) * (N,h,T,d_k) -> (N,h,T,d_k)
        
        A = attn_weights @ v
        
        #reshape it back before final linear layer
        A = A.transpose(1,2) #(N,T,h,d_k)
        A = A.contiguous().view(N, T, self.d_k * self.n_heads) # (N,T,h*d_k)
        
        #projection
        return self.fc(A)

In [52]:
model_debug1 = MultiHeadAttention_DEBUG_1(16,64,4)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
model_debug1.to(device)

cuda:0


MultiHeadAttention_DEBUG_1(
  (key): Linear(in_features=64, out_features=64, bias=True)
  (query): Linear(in_features=64, out_features=64, bias=True)
  (value): Linear(in_features=64, out_features=64, bias=True)
  (fc): Linear(in_features=64, out_features=64, bias=True)
)

In [55]:
x = np.random.randint(0,20_000,size=(8,512))
x_t = torch.tensor(x).to(device)
y = model_debug1(x_t,x_t,x_t)

RuntimeError: mat1 and mat2 must have the same dtype

In [80]:
test = nn.Linear(10, 20)

In [84]:
test.weight.shape

torch.Size([20, 10])