# 2023-01-30 self attention layers

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
import numpy as np

In [2]:
B, T, C = 32, 8, 32 ; # batch, time, channel
n_embd = 32
batch_size, block_size = B, T
max_iter = 3000
eval_iters = 200
eval_interval = 300
head_size = 16

In [3]:
torch.cuda.device_count(), torch.cuda.current_device()

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# read on review data
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()
# Here is all unique character that occur in this text
chars = sorted(list(set(text)))
vocab_size = len(chars)

# create a mapping from characters to integers
ctoi = {c: i for i, c in enumerate(chars)}
itoc = {i: c for i, c in enumerate(chars)}
encode = lambda s: [ctoi[c] for c in s]
decode = lambda l: ''.join([itoc[i] for i in l])

data = torch.tensor(encode(text), dtype=torch.long)
# Let's now split up the data into train set and validation set
n = round(len(data) * 0.9);
train_data = data[:n]
val_data   = data[n:]

In [4]:
torch.manual_seed(1337)

def get_batch(split='train'):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(train_data) - block_size, (batch_size, ))
    xb = torch.stack([train_data[i:i+block_size] for i in ix])
    yb = torch.stack([train_data[i+1:i+1+block_size] for i in ix])
    xb, yb = xb.to(device), yb.to(device)
    return xb, yb

class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.query = nn.Linear(C, head_size)
        self.key = nn.Linear(C, head_size)
        self.value = nn.Linear(C, head_size)
        self.register_buffer('tril', torch.tril(torch.ones(T, T)))
    def forward(self, x):
        q = self.query(x) # B, T, head_size
        k = self.key(x) # B, T, head_size
        v = self.value(x) # B, T, head_size
        # computer attention score
        wei = q @ v.transpose(-2, -1) * head_size ** -0.5 # (B, T, head_size) x (B, head_size, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril == 0, float('-inf'))
        wei = torch.softmax(wei,dim=-1) # (B, T, T)
        # perform the weighted aggregation
        out = wei@v # (B, T, T) x (B, T, head_size) -> (B, T, head_size)
        return out
    
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(T, n_embd)
        self.sa_head = Head(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)
        
    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arange(T).to(device))
        x = tok_emb + pos_emb # (B, T, n_embd)
        x = self.sa_head(x) # apply one head attention
        logits = self.lm_head(x) # (B, n_embd, vocab_size) x (B, T, n_embd) -> (B, T, vocab_size)
        if targets is not None:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        else:
            loss = None
        return loss, logits

    @torch.no_grad()
    def generate(self, idx, max_new_token):
        for i in range(max_new_token):
            loss, logits = model(idx[:, -block_size:])
            logits = logits[:, -1,:]
            probs = F.softmax(logits, -1)
            next_idx = torch.multinomial(probs, 1)
            idx = torch.cat([idx, next_idx], 1)
        return idx
    
@torch.no_grad()
def estimate_loss():
    model.eval()
    losses = torch.zeros(eval_iters)
    out = {}
    for split in ['train', 'val']:
        for i in range(eval_iters):
            xb, yb = get_batch()
            loss, logits = model(xb, yb)
            losses[i] = loss
        out[split] = losses.mean().item()
    return out

def train():
    for i in range(max_iter):
        if i % eval_iters == 0:
            out = estimate_loss()
            print(f"Train loss: {out['train']}. Val loss: {out['val']}. ")
        xb, yb = get_batch()
        loss, logits = model(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()

In [5]:
model = BigramLanguageModel(vocab_size).to(device)
m = model.to(device)
optimizer = AdamW(model.parameters())

In [6]:
# sent = model.generate(idx=torch.zeros((1,1), dtype=torch.long), max_new_token=1000)
# print(decode(sent[0].tolist()))

In [7]:
train()
# sent = model.generate(idx=torch.zeros((1,1), dtype=torch.long), max_new_token=500)
# print(decode(sent[0].tolist()))

Train loss: 4.209799766540527. Val loss: 4.210343837738037. 
Train loss: 3.0118324756622314. Val loss: 3.029925584793091. 
Train loss: 2.7751874923706055. Val loss: 2.7928709983825684. 
Train loss: 2.644948720932007. Val loss: 2.658383369445801. 
Train loss: 2.572192668914795. Val loss: 2.58717679977417. 
Train loss: 2.5385854244232178. Val loss: 2.527400493621826. 
Train loss: 2.5111818313598633. Val loss: 2.5001227855682373. 
Train loss: 2.49641752243042. Val loss: 2.503206729888916. 
Train loss: 2.4784927368164062. Val loss: 2.473785161972046. 
Train loss: 2.4662117958068848. Val loss: 2.453801155090332. 
Train loss: 2.4490010738372803. Val loss: 2.4530270099639893. 
Train loss: 2.4463274478912354. Val loss: 2.452366352081299. 
Train loss: 2.440185308456421. Val loss: 2.4310648441314697. 
Train loss: 2.439107656478882. Val loss: 2.4313089847564697. 
Train loss: 2.434870958328247. Val loss: 2.43583345413208. 


In [8]:
train()
# sent = model.generate(idx=torch.zeros((1,1), dtype=torch.long), max_new_token=500)
# print(decode(sent[0].tolist()))

Train loss: 2.427349805831909. Val loss: 2.421347141265869. 
Train loss: 2.414802074432373. Val loss: 2.4105308055877686. 
Train loss: 2.4139771461486816. Val loss: 2.408018112182617. 
Train loss: 2.4174203872680664. Val loss: 2.40922474861145. 
Train loss: 2.4164133071899414. Val loss: 2.4019827842712402. 
Train loss: 2.412034034729004. Val loss: 2.4040515422821045. 
Train loss: 2.4022231101989746. Val loss: 2.3965790271759033. 
Train loss: 2.407172203063965. Val loss: 2.3981354236602783. 
Train loss: 2.3984341621398926. Val loss: 2.4028024673461914. 
Train loss: 2.3976922035217285. Val loss: 2.3967223167419434. 
Train loss: 2.394608736038208. Val loss: 2.399974822998047. 
Train loss: 2.393988847732544. Val loss: 2.3962814807891846. 
Train loss: 2.38093900680542. Val loss: 2.3974993228912354. 
Train loss: 2.3883285522460938. Val loss: 2.390153169631958. 
Train loss: 2.3956613540649414. Val loss: 2.3789026737213135. 


### Convert this file to md

In [9]:
from IPython.core.display import Javascript

In [12]:
%%js
IPython.notebook.kernel.execute('this_notebook = "' + IPython.notebook.notebook_name + '"')

<IPython.core.display.Javascript object>

In [13]:
this_notebook

'2023-01-30-attention-layer-compact.ipynb'

In [14]:
!jupyter nbconvert --to markdown {this_notebook} --output-dir=../_posts

[NbConvertApp] Converting notebook 2023-01-30-attention-layer-compact.ipynb to markdown
[NbConvertApp] Writing 7526 bytes to ../_posts/2023-01-30-attention-layer-compact.md
