# 2023-01-30 self attention layers

In [1]:
# download data

In [2]:
import torch
torch.cuda.device_count(), torch.cuda.current_device()

(1, 0)

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [4]:
# !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [5]:
# read on review data
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [6]:
# Here is all unique character that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)

In [7]:
# create a mapping from characters to integers
ctoi = {c: i for i, c in enumerate(chars)}
itoc = {i: c for i, c in enumerate(chars)}

encode = lambda s: [ctoi[c] for c in s]
decode = lambda l: ''.join([itoc[i] for i in l])

print(encode("hi there"))

print(decode(encode("hi there")))

[46, 47, 1, 58, 46, 43, 56, 43]
hi there


In [8]:
# Let now encode the entire text dataset and store it into torch.Tensor
import torch

data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)

data[0:100]

torch.Size([1115394]) torch.int64


tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])

In [9]:
# Let's now split up the data into train set and validation set
n = round(len(data) * 0.9);
train_data = data[:n]
val_data   = data[n:]
len(train_data), len(val_data)

(1003855, 111539)

In [10]:
block_size = 8
train_data[:block_size]

tensor([18, 47, 56, 57, 58,  1, 15, 47])

In [11]:
x = train_data[:block_size]
y = train_data[1:block_size + 1]

for i in range(block_size):
    context = x[:i+1]
    target = y[i]
    print(f"when context is {context} the target: {target}.")

when context is tensor([18]) the target: 47.
when context is tensor([18, 47]) the target: 56.
when context is tensor([18, 47, 56]) the target: 57.
when context is tensor([18, 47, 56, 57]) the target: 58.
when context is tensor([18, 47, 56, 57, 58]) the target: 1.
when context is tensor([18, 47, 56, 57, 58,  1]) the target: 15.
when context is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47.
when context is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58.


In [12]:
torch.manual_seed(1337)

batch_size = 4
block_size = 8

def get_batch(split='train'):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(train_data) - block_size, (batch_size, ))
    xb = torch.stack([train_data[i:i+block_size] for i in ix])
    yb = torch.stack([train_data[i+1:i+1+block_size] for i in ix])
    xb, yb = xb.to(device), yb.to(device)
    return xb, yb

xb, yb = get_batch()
print(xb)
print(yb)
for b in range(batch_size):
    for i in range(block_size):
        x = xb[b,:i+1]
        y = yb[b,i]
        print(f"when context is {x} the target: {y}.")

tensor([[56,  6,  0, 24, 43, 58,  1, 61],
        [39, 47, 51,  1, 58, 46, 39, 58],
        [52, 45,  1, 58, 53,  1, 57, 39],
        [43, 47, 52, 45,  1, 46, 53, 50]], device='cuda:0')
tensor([[ 6,  0, 24, 43, 58,  1, 61, 46],
        [47, 51,  1, 58, 46, 39, 58,  1],
        [45,  1, 58, 53,  1, 57, 39, 63],
        [47, 52, 45,  1, 46, 53, 50, 47]], device='cuda:0')
when context is tensor([56], device='cuda:0') the target: 6.
when context is tensor([56,  6], device='cuda:0') the target: 0.
when context is tensor([56,  6,  0], device='cuda:0') the target: 24.
when context is tensor([56,  6,  0, 24], device='cuda:0') the target: 43.
when context is tensor([56,  6,  0, 24, 43], device='cuda:0') the target: 58.
when context is tensor([56,  6,  0, 24, 43, 58], device='cuda:0') the target: 1.
when context is tensor([56,  6,  0, 24, 43, 58,  1], device='cuda:0') the target: 61.
when context is tensor([56,  6,  0, 24, 43, 58,  1, 61], device='cuda:0') the target: 46.
when context is tensor(

In [13]:
print(xb) # input to our transformer

tensor([[56,  6,  0, 24, 43, 58,  1, 61],
        [39, 47, 51,  1, 58, 46, 39, 58],
        [52, 45,  1, 58, 53,  1, 57, 39],
        [43, 47, 52, 45,  1, 46, 53, 50]], device='cuda:0')


In [14]:
# self attention layer

In [15]:
import torch

torch.manual_seed(1338)

B, T, C = 4, 8, 2 ; # batch, time, channel

x = torch.randn((B, T, C))

x[0]

tensor([[-1.3113, -1.0017],
        [-1.2342,  0.1297],
        [-0.5150, -1.2666],
        [-0.6719,  0.1851],
        [ 0.9367,  0.3139],
        [-1.3950,  0.1132],
        [ 0.3622,  2.5192],
        [-0.7672, -0.9529]])

In [16]:
# version 1
xbow = torch.zeros_like(x)
for b in range(B):
    for t in range(T):
        x_prev = x[b, :t+1] # t, C
        xbow[b,t] = torch.mean(x_prev, dim=0)
xbow[0]

tensor([[-1.3113, -1.0017],
        [-1.2728, -0.4360],
        [-1.0202, -0.7129],
        [-0.9331, -0.4884],
        [-0.5591, -0.3279],
        [-0.6985, -0.2544],
        [-0.5469,  0.1418],
        [-0.5745,  0.0050]])

In [17]:
# version 2
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(dim=1, keepdim=True)
xbow2 = wei @ x # (B, T, T) x (B, T, C) -> (B, T, C)
torch.allclose(xbow, xbow2)

True

In [18]:
# version 3
# using solfmax
wei = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(wei==0, -torch.inf)
wei = torch.softmax(wei, dim=-1)
xbow3 = wei @ x # (B, T, T) x (B, T, C) -> (B, T, C)
torch.allclose(xbow, xbow3)

True

In [19]:
import torch
import torch.nn as nn

torch.manual_seed(1337)

B, T, C = 4, 8, 32 ; # batch, time, channel
head_size = 16

x = torch.randn((B, T, C))

query = nn.Linear(C, head_size, bias=False)
key = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

q = query(x) # (B, T, head_size)
k = key(x) # (B, T, head_size)
v = value(x) # (B, T, head_size)

wei = q@k.transpose(-2, -1) # (B, T, head_size) @ (B, head_size, head_size, T) -> (B, T, T)
wei = wei / torch.sqrt(torch.tensor(head_size))
tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril==0, -torch.inf)
wei = torch.softmax(wei, dim=-1)

out = wei @ v # (B, T, T) x (B, T, C) -> (B, T, C)

q.shape, k.shape, v.shape, wei.shape, out.shape

(torch.Size([4, 8, 16]),
 torch.Size([4, 8, 16]),
 torch.Size([4, 8, 16]),
 torch.Size([4, 8, 8]),
 torch.Size([4, 8, 16]))

In [20]:
# create a class for self attention layer
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.query = nn.Linear(C, head_size)
        self.key = nn.Linear(C, head_size)
        self.value = nn.Linear(C, head_size)
        self.register_buffer('tril', torch.tril(torch.ones(T, T)))
    def forward(self, x):
        q = self.query(x) # B, T, head_size
        k = self.key(x) # B, T, head_size
        v = self.value(x) # B, T, head_size
        # computer attention score
        wei = q @ v.transpose(-2, -1) * head_size ** -0.5 # (B, T, head_size) x (B, head_size, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril == 0, float('-inf'))
        wei = torch.softmax(wei,dim=-1) # (B, T, T)
        # perform the weighted aggregation
        out = wei@v # (B, T, T) x (B, T, head_size) -> (B, T, head_size)
        return out
    
head = Head(2)
out = head(x); out.shape

torch.Size([4, 8, 2])

In [21]:
import torch
import torch.nn as nn
import torch.nn.functional as F

torch.manual_seed(1337)

# bigram language model
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(T, n_embd)
        self.sa_head = Head(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)
        
    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arange(T).to(device))
        x = tok_emb + pos_emb # (B, T, n_embd)
        x = self.sa_head(x) # apply one head attention
        logits = self.lm_head(x) # (B, n_embd, vocab_size) x (B, T, n_embd) -> (B, T, vocab_size)
        if targets is not None:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        else:
            loss = None
        return loss, logits

    @torch.no_grad()
    def generate(self, idx, max_new_token):
        for i in range(max_new_token):
            loss, logits = model(idx[:, -block_size:])
            logits = logits[:, -1,:]
            probs = F.softmax(logits, -1)
            next_idx = torch.multinomial(probs, 1)
            idx = torch.cat([idx, next_idx], 1)
        return idx
    
B, T, C = 32, 8, 32 ; # batch, time, channel
n_embd = 32
batch_size, block_size = B, T
max_iter = 3000
eval_iters = 200
eval_interval = 300

head_size = 16
xb, yb = get_batch()
print(xb.shape, yb.shape)

model = BigramLanguageModel(vocab_size).to(device)
m = model.to(device)

loss, logits = model(xb, yb)
print(loss)

torch.Size([32, 8]) torch.Size([32, 8])
tensor(4.2236, device='cuda:0', grad_fn=<NllLossBackward0>)


In [22]:
@torch.no_grad()
def estimate_loss():
    model.eval()
    losses = torch.zeros(eval_iters)
    out = {}
    for split in ['train', 'val']:
        for i in range(eval_iters):
            xb, yb = get_batch()
            loss, logits = model(xb, yb)
            losses[i] = loss
        out[split] = losses.mean().item()
    return out

In [23]:
# sent = model.generate(idx=torch.zeros((1,1), dtype=torch.long), max_new_token=1000)
# print(decode(sent[0].tolist()))

In [24]:
# create pytorch optimizer
from torch.optim import AdamW
import numpy as np


optimizer = AdamW(model.parameters())

In [25]:
# do optimize
def train():
    for i in range(max_iter):
        if i % eval_iters == 0:
            out = estimate_loss()
            print(f"Train loss: {out['train']}. Val loss: {out['val']}. ")
        xb, yb = get_batch()
        loss, logits = model(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

train()

# sent = model.generate(idx=torch.zeros((1,1), dtype=torch.long), max_new_token=500)
# print(decode(sent[0].tolist()))

Train loss: 4.250394344329834. Val loss: 4.251773834228516. 
Train loss: 3.0892834663391113. Val loss: 3.1048145294189453. 
Train loss: 2.921942949295044. Val loss: 2.9380691051483154. 
Train loss: 2.74647855758667. Val loss: 2.7568492889404297. 
Train loss: 2.6170177459716797. Val loss: 2.62808895111084. 
Train loss: 2.5652170181274414. Val loss: 2.5561227798461914. 
Train loss: 2.5313711166381836. Val loss: 2.5239756107330322. 
Train loss: 2.5157248973846436. Val loss: 2.5192513465881348. 
Train loss: 2.4950547218322754. Val loss: 2.485825538635254. 
Train loss: 2.475869655609131. Val loss: 2.4657135009765625. 
Train loss: 2.456251859664917. Val loss: 2.4611053466796875. 
Train loss: 2.4570863246917725. Val loss: 2.4615395069122314. 
Train loss: 2.4492809772491455. Val loss: 2.438823938369751. 
Train loss: 2.4471447467803955. Val loss: 2.4392707347869873. 
Train loss: 2.448664903640747. Val loss: 2.447890043258667. 


In [26]:
train()
    
# sent = model.generate(idx=torch.zeros((1,1), dtype=torch.long), max_new_token=500)
# print(decode(sent[0].tolist()))

Train loss: 2.435600996017456. Val loss: 2.430391550064087. 
Train loss: 2.4216527938842773. Val loss: 2.4209277629852295. 
Train loss: 2.422429323196411. Val loss: 2.418130874633789. 
Train loss: 2.4247641563415527. Val loss: 2.4178075790405273. 
Train loss: 2.4253361225128174. Val loss: 2.4130218029022217. 
Train loss: 2.4201459884643555. Val loss: 2.4103615283966064. 
Train loss: 2.4079363346099854. Val loss: 2.402588129043579. 
Train loss: 2.4161858558654785. Val loss: 2.408769369125366. 
Train loss: 2.4048120975494385. Val loss: 2.407437801361084. 
Train loss: 2.404520034790039. Val loss: 2.402391195297241. 
Train loss: 2.3978495597839355. Val loss: 2.4051241874694824. 
Train loss: 2.3947062492370605. Val loss: 2.393450975418091. 
Train loss: 2.383281707763672. Val loss: 2.397740125656128. 
Train loss: 2.3895959854125977. Val loss: 2.3915750980377197. 
Train loss: 2.3961288928985596. Val loss: 2.376368522644043. 


### Convert this file to md

In [27]:
from IPython.core.display import Javascript

In [31]:
%%js
IPython.notebook.kernel.execute('this_notebook = "' + IPython.notebook.notebook_name + '"')

<IPython.core.display.Javascript object>

In [32]:
this_notebook

'2023-01-30-attention-layer.ipynb'

In [33]:
!jupyter nbconvert --to markdown {this_notebook} --output-dir=../_posts

[NbConvertApp] Converting notebook 2023-01-30-attention-layer.ipynb to markdown
[NbConvertApp] Writing 15334 bytes to ../_posts/2023-01-30-attention-layer.md
