In [3]:
import torch
from torch.nn import Linear, GELU, CrossEntropyLoss, Sequential, Conv2d, MaxPool2d, Module, Softmax, BatchNorm2d, Dropout, functional as F

Let's start with masked (causal) self-attention.

![Self Attention](images/self_attention.png)

In [1]:
from gpt2attention import SelfAttention

# Reference attention layer imported from the local gpt2attention module.
# NOTE(review): the arguments are presumably (embedding dim, number of heads), matching
# dim = 768 and n_heads = 12 in the step-by-step cells below; confirm against SelfAttention.__init__.
sa = SelfAttention(768, 12)

In [5]:
# Random stand-in input of shape (4, 50, 768).
# NOTE(review): presumably (batch, sequence, embedding), as in the step-by-step cell below.
x = torch.randn(4, 50, 768)
# Only the output shape is displayed.
# NOTE(review): if SelfAttention is an nn.Module, prefer sa(x) so that module hooks run.
sa.forward(x).shape

torch.Size([4, 50, 768])

In [6]:
# --- Settings for the hand-rolled attention walkthrough ---
batch_size = 4
context = 50  # sequence length of the demo input
MAX_CONTEXT = 128
n_heads = 12
dim = 768
embed_dim = 768

# One fused Linear yields the query, key and value projections side by side
# (hence 3 * dim output features); c_proj is the output projection back to dim.
c_attn = Linear(dim, 3 * dim, bias=True)
c_proj = Linear(dim, dim, bias=True)

# Random activations standing in for embedded tokens: (batch_size, context, embed_dim)
x = torch.randn(batch_size, context, embed_dim)

In [34]:
def split_heads(x):
    """View the last axis of ``x`` as ``n_heads`` chunks of ``dim // n_heads``.

    The first two axes are kept as they are, so the result has shape
    ``(x.shape[0], x.shape[1], n_heads, dim // n_heads)``. Relies on the
    notebook-level ``n_heads`` and ``dim`` settings; ``.view`` reinterprets the
    existing storage and does not copy or reorder data.
    """
    batch, seq_len = x.shape[:2]
    head_dim = dim // n_heads
    return x.view(batch, seq_len, n_heads, head_dim)

def merge_heads(x: torch.Tensor, num_heads, head_dim) -> torch.Tensor:
    """Collapse the per-head axes of ``x`` back into a single feature axis.

    Args:
        x: tensor whose first two axes are kept and whose remaining elements
            total ``num_heads * head_dim`` per position.
        num_heads: number of attention heads.
        head_dim: feature width of one head.

    Returns:
        Tensor of shape ``(x.shape[0], x.shape[1], num_heads * head_dim)``.
    """
    batch, seq_len = x.shape[0], x.shape[1]
    merged_dim = num_heads * head_dim
    # .view needs dense memory; inputs that were transposed upstream are copied here.
    return x.contiguous().view(batch, seq_len, merged_dim)

In [35]:
def attention(q, k, v, mask=None, verbose=True):
    """Causal scaled dot-product attention on per-head tensors.

    Args:
        q: queries, shape (batch, query_len, n_heads, head_dim).
        k: keys, shape (batch, key_len, n_heads, head_dim).
        v: values, shape (batch, key_len, n_heads, head_dim).
        mask: optional additive mask, broadcastable to
            (batch, n_heads, query_len, key_len). It is added to the scaled
            scores before the causal mask is applied.
        verbose: when True (the default) print the intermediate tensor shapes.

    Returns:
        Tensor of shape (batch, query_len, n_heads, head_dim). It is a
        transposed view, so make it contiguous before calling ``.view`` on it.
    """
    # (batch, heads, query_len, head_dim) @ (batch, heads, head_dim, key_len)
    w = torch.matmul(q.transpose(1, 2), k.permute(0, 2, 3, 1))
    # Scale by sqrt(head_dim) to keep the logits in a range softmax handles well.
    w = w / (k.shape[-1] ** 0.5)
    if verbose:
        print(f'w.shape: {w.shape}')
        print(f'q.shape: {q.shape}')
        print(f'k.shape: {k.shape}')
        print(f'v.shape: {v.shape}')
    if mask is not None:
        w = w + mask
    query_len = q.shape[1]
    key_len = k.shape[1]
    # Causal mask. When there are more keys than queries (e.g. decoding one new
    # token against cached keys) the queries are the *last* query_len positions,
    # so the lower triangle is shifted right by the number of extra keys.
    # Without the shift the newest token could only see the oldest keys.
    past_len = max(key_len - query_len, 0)
    causal_mask = torch.tril(
        torch.ones((query_len, key_len), dtype=torch.bool, device=w.device),
        diagonal=past_len,
    )
    mask_value = torch.finfo(w.dtype).min  # stands in for -inf without producing NaNs
    w = w.masked_fill(~causal_mask, mask_value)
    if verbose:
        print(f'w.shape: {w.shape}')

    w = F.softmax(w, dim=-1)
    if verbose:
        print(f'w.shape after softmax: {w.shape}')
        print(f'v.shape: {v.shape}')
    # (batch, heads, query_len, key_len) @ (batch, heads, key_len, head_dim),
    # then move the head axis back behind the sequence axis.
    attn_output = torch.matmul(w, v.transpose(1, 2)).transpose(1, 2)
    if verbose:
        print(f'attn_output.shape: {attn_output.shape}')
    return attn_output

In [36]:
# Forward pass, step by step
# 1. One fused projection, then cut the last axis into chunks of width dim
#    (queries, keys, values) and expose the head axis on each chunk.
xqkv = c_attn(x)
queries, keys, values = (split_heads(chunk) for chunk in xqkv.split(dim, dim=2))

# 2. Attend per head, fold the heads back together, apply the output projection.
attn_output = c_proj(
    merge_heads(attention(queries, keys, values), n_heads, dim // n_heads)
)
attn_output.shape

# expected: (4, 50, 768)


w.shape: torch.Size([4, 12, 50, 50])
q.shape: torch.Size([4, 50, 12, 64])
k.shape: torch.Size([4, 50, 12, 64])
v.shape: torch.Size([4, 50, 12, 64])
w.shape: torch.Size([4, 12, 50, 50])
w.shape after softmax: torch.Size([4, 12, 50, 50])
v.shape: torch.Size([4, 50, 12, 64])
attn_output.shape: torch.Size([4, 50, 12, 64])


torch.Size([4, 50, 768])

In [15]:
# Quick tokenizer check. `tokenizer` is created with tiktoken.get_encoding in the
# model-setup cell further down, so that cell has to be run before this one.
tokenizer.encode('Hello, my dog is cute')

[15496, 11, 616, 3290, 318, 13779]

In [1]:
from transformer import Transformer
from dataset import TextDataset
from torch.utils.data import DataLoader
import torch
import tiktoken

# Tokenizer: the "gpt2" encoding shipped with tiktoken.
model_size = "gpt2"
tokenizer = tiktoken.get_encoding(model_size)

# Use the GPU when one is visible, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the model with the settings below.
# NOTE(review): these look like the GPT-2 small hyper-parameters with a 128-token context; confirm against the Transformer class.
model = Transformer(
    dim=768,
    n_heads=12,
    vocab_size=50257,
    n_layers=12,
    max_seq_len=128,
    device=device,
)

  return torch._C._cuda_getDeviceCount() > 0


In [6]:
import numpy as np
import os
data_dir = "./data/shakespeare"
# The model above uses vocab_size=50257, so token ids go up to 50256. That does
# not fit in a signed 16-bit integer (max 32767): read as int16, every id >= 32768
# wraps to a negative number, which is not a valid embedding index.
# NOTE(review): assumes train.bin holds raw unsigned 16-bit token ids; confirm
# against the script that wrote the file, and make sure TextDataset widens the
# ids to int64 before building tensors.
data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
dataset = TextDataset(data, tokenizer, max_length=128, input_type="bin")
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=0)

In [2]:
import os
input_file_path = os.path.join("./data/shakespeare/", 'input.txt')
# Explicit encoding: without it open() falls back to the platform locale, which
# can decode the corpus differently from one machine to the next.
with open(input_file_path, 'r', encoding='utf-8') as f:
    data = f.read()
n = len(data)  # number of characters in the raw corpus

# Same dataset/dataloader names as the binary variant, this time built from raw text.
dataset = TextDataset(data, tokenizer, max_length=128, input_type="text")
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=0)

NameError: name 'os' is not defined

In [47]:
# Inspect the token ids stored on the dataset.
# NOTE(review): this renders the whole list; a slice such as dataset.inputs[:10]
# would keep the cell output readable.
dataset.inputs

[[37],
 [72],
 [81],
 [82],
 [83],
 [220],
 [34],
 [72],
 [83],
 [72],
 [89],
 [68],
 [77],
 [25],
 [198],
 [33],
 [68],
 [69],
 [78],
 [81],
 [68],
 [220],
 [86],
 [68],
 [220],
 [79],
 [81],
 [78],
 [66],
 [68],
 [68],
 [67],
 [220],
 [64],
 [77],
 [88],
 [220],
 [69],
 [84],
 [81],
 [83],
 [71],
 [68],
 [81],
 [11],
 [220],
 [71],
 [68],
 [64],
 [81],
 [220],
 [76],
 [68],
 [220],
 [82],
 [79],
 [68],
 [64],
 [74],
 [13],
 [198],
 [198],
 [32],
 [75],
 [75],
 [25],
 [198],
 [50],
 [79],
 [68],
 [64],
 [74],
 [11],
 [220],
 [82],
 [79],
 [68],
 [64],
 [74],
 [13],
 [198],
 [198],
 [37],
 [72],
 [81],
 [82],
 [83],
 [220],
 [34],
 [72],
 [83],
 [72],
 [89],
 [68],
 [77],
 [25],
 [198],
 [56],
 [78],
 [84],
 [220],
 [64],
 [81],
 [68],
 [220],
 [64],
 [75],
 [75],
 [220],
 [81],
 [68],
 [82],
 [78],
 [75],
 [85],
 [68],
 [67],
 [220],
 [81],
 [64],
 [83],
 [71],
 [68],
 [81],
 [220],
 [83],
 [78],
 [220],
 [67],
 [72],
 [68],
 [220],
 [83],
 [71],
 [64],
 [77],
 [220],
 [83],
 [78],
 [

In [38]:
# Peek at one batch only: the raw ids first, then the decoded text of its second sample.
for x, y in dataloader:
    print(x.tolist())
    print(tokenizer.decode(x[1].tolist()))
    break

[[68], [77], [84], [220]]
n


### Training

In [8]:
import torch

optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001)
num_epochs = 1

for epoch in range(num_epochs):
    for i, batch in enumerate(dataloader):
        inputs, labels = batch
        # Tensor.to() returns the moved tensor instead of modifying it in place,
        # so the result has to be rebound or the batch stays where it was.
        inputs = inputs.to(device)
        labels = labels.to(device)

        print(f"batch: {i}")
        optimizer.zero_grad()
        # The model returns (logits, loss) when labels are passed.
        logits, loss = model(inputs, labels=labels)
        print(f"loss: {loss}")
        loss.backward()
        optimizer.step()
        # Smoke test: stop after the first batch.
        break

batch: 0
position_embeddings.shape: torch.Size([128, 768])
x.shape: torch.Size([4, 128])


IndexError: index out of range in self