## GPT from scratch based on the Transformer paper

This model is transformer's decoder only model.


In [1]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/GptFromScratch

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/12SyDlQ1D6x-YEN8z-j54X8Bjsq6-jhyd/GptFromScratch


In [2]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import tempfile

In [3]:
# hyperparameters
batch_size = 16 # number of independent sequences processed in parallel
block_size = 32 # number of maximum context length for predictions
max_iters = 160000
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
eval_iters = 200
n_embed = 64
n_head = 4 # number of head for multi-heads (headの数)
n_layer = 4 # number of multi-head layers (multi-headのレイヤーの数)
dropout = 0.0
model_path = "./saved_model"
train_from_scratch = False
download_data = False
# ------------

print(device)

cuda


In [4]:
if download_data:
  # Get Data
  import urllib.request
  txt_urls = [
            # "https://s3.amazonaws.com/text-datasets/nietzsche.txt",
            # "https://raw.githubusercontent.com/ravexina/shakespeare-plays-dataset-scraper/master/shakespeare-db/All's%20Well%20That%20Ends%20Well.txt",
            "https://raw.githubusercontent.com/ErikaJacobs/Harry-Potter-Text-Mining/master/Book%20Text/HPBook1.txt",
            "https://raw.githubusercontent.com/ErikaJacobs/Harry-Potter-Text-Mining/master/Book%20Text/HPBook2.txt",
            "https://raw.githubusercontent.com/ErikaJacobs/Harry-Potter-Text-Mining/master/Book%20Text/HPBook3.txt",
            "https://raw.githubusercontent.com/ErikaJacobs/Harry-Potter-Text-Mining/master/Book%20Text/HPBook4.txt",
            "https://raw.githubusercontent.com/ErikaJacobs/Harry-Potter-Text-Mining/master/Book%20Text/HPBook5.txt",
            "https://raw.githubusercontent.com/ErikaJacobs/Harry-Potter-Text-Mining/master/Book%20Text/HPBook6.txt",
            "https://raw.githubusercontent.com/ErikaJacobs/Harry-Potter-Text-Mining/master/Book%20Text/HPBook7.txt",
            ]

  text = ""
  with tempfile.TemporaryDirectory() as tmpdirname:
    for txt_url_num in range(len(txt_urls)):
        !wget "{txt_urls[txt_url_num]}" -O "{tmpdirname}/train_{txt_url_num}.txt"
        !echo "{tmpdirname}/train_{txt_url_num}.txt"
        with open(f"{tmpdirname}/train_{txt_url_num}.txt", "r") as f:
          text += f.read()

  with open('train.txt', 'a') as f:
      f.write(text)

In [5]:
# create folder
import os
os.makedirs(model_path, exist_ok=True)

In [6]:
# Tokenize
class Tokenizer():
    def __init__(self, characters_list:list) -> None:
        self.characters_list = characters_list

    # map string to int
    def mapStringToInt(self):
        self.MapStrToInt = {ch:i for i, ch in enumerate(self.characters_list)}
        return self.MapStrToInt

    # map to int to string
    def mapIntToString(self):
        self.MapIntToStr = {i:ch for i, ch in enumerate(self.characters_list)}
        return self.MapIntToStr

    # string to int
    def encoder(self, string:str):
        CharMaps=self.MapStrToInt
        return [CharMaps[c] for c in string]

    # int to string
    def decoder(self, int_list:list):
        CharMaps=self.MapIntToStr
        return ''.join([CharMaps[i] for i in int_list])


In [7]:
# Training and validate
class DataLoader():
    def __init__(self, data, block_size, batch_size) -> None:
        self.data = data
        self.block_size = block_size
        self.batch_size = batch_size

    # split data into validation and training data.
    def splitData(self, splitPercentage=0.9):
        n = int(splitPercentage*len(self.data))
        self.val_data = self.data[n:]
        self.train_data = self.data[:n]

        return self.train_data, self.val_data

    # Get chunk of data
    def getBlock(self, start, end, data_list:list):
        return data_list[start:end]

    # get batch
    def get_batch(self, split):
        # choose train_data or val_data
        data = self.train_data if split == 'train' else self.val_data
        # generate batch size random numbers
        ix = torch.randint(len(data) - self.block_size, (self.batch_size,))
        # first block size words
        # stack to create new dimension
        x = torch.stack([self.getBlock(start=i, end=i+self.block_size, data_list=self.data) for i in ix])
        # offset of x
        y = torch.stack([self.getBlock(start=i+1, end=i+self.block_size+1, data_list=self.data) for i in ix])
        x, y = x.to(device), y.to(device)
        return x, y

In [8]:
with open("train.txt", "r", encoding='utf-8') as f:
    text = f.read()

print(f"Len(train.txt): {len(text)}")

Len(train.txt): 25300256


In [9]:
# characters inside the training data.
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)
print(chars)

	
 !"$%&'()*,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz|}~ ¦¨–—‘’“”…
104
['\t', '\n', ' ', '!', '"', '$', '%', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '|', '}', '~', '\xa0', '¦', '¨', '–', '—', '‘', '’', '“', '”', '…']


In [10]:
tokenizer = Tokenizer(characters_list = chars)

MapStrToInt = tokenizer.mapStringToInt()
MapIntToStr = tokenizer.mapIntToString()

print(tokenizer.encoder(string="hi there"))
print(tokenizer.decoder(int_list = tokenizer.encoder(string="hi there")))

[72, 73, 2, 84, 72, 69, 82, 69]
hi there


In [11]:
# Encode the train.txt and wrap it in tensor
data = torch.tensor(tokenizer.encoder(text), dtype=torch.long)
print(f"data.shape: {data.shape}")

data.shape: torch.Size([25300256])


In [12]:
dataLoader = DataLoader(data = data,
                        block_size= block_size,
                        batch_size= batch_size)

In [13]:
# Split the data
train_data, val_data = dataLoader.splitData()

In [14]:
# Split the data to chunk
# maximum context length for predictions

x = dataLoader.getBlock(data_list=train_data, start=0, end=block_size)
y = dataLoader.getBlock(data_list=train_data, start=1, end=block_size+1)

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"When input is {context} the target is {target}")

When input is tensor([4]) the target is 52
When input is tensor([ 4, 52]) the target is 69
When input is tensor([ 4, 52, 69]) the target is 88
When input is tensor([ 4, 52, 69, 88]) the target is 84
When input is tensor([ 4, 52, 69, 88, 84]) the target is 4
When input is tensor([ 4, 52, 69, 88, 84,  4]) the target is 32
When input is tensor([ 4, 52, 69, 88, 84,  4, 32]) the target is 4
When input is tensor([ 4, 52, 69, 88, 84,  4, 32,  4]) the target is 35
When input is tensor([ 4, 52, 69, 88, 84,  4, 32,  4, 35]) the target is 72
When input is tensor([ 4, 52, 69, 88, 84,  4, 32,  4, 35, 72]) the target is 65
When input is tensor([ 4, 52, 69, 88, 84,  4, 32,  4, 35, 72, 65]) the target is 80
When input is tensor([ 4, 52, 69, 88, 84,  4, 32,  4, 35, 72, 65, 80]) the target is 84
When input is tensor([ 4, 52, 69, 88, 84,  4, 32,  4, 35, 72, 65, 80, 84]) the target is 69
When input is tensor([ 4, 52, 69, 88, 84,  4, 32,  4, 35, 72, 65, 80, 84, 69]) the target is 82
When input is tensor([ 

In [15]:
# generate random numbers and seed it so the numbers wont change every time we execute
torch.manual_seed(1337)

# number of independent sequences for parallel process
xb, yb = dataLoader.get_batch(split='train')
print("inputs:")
print(xb.shape)
print(xb)

print("targets:")
print(yb.shape)
print(yb)

for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context = xb[b:t+1]
        target = yb[b,t]
        print(f"When input is {context.tolist()} the target is {target}")

inputs:
torch.Size([16, 32])
tensor([[12,  8,  2, 83, 65, 73, 68,  2, 40, 69, 82, 77, 73, 79, 78, 69, 12,  2,
         84, 87, 73, 83, 84, 73, 78, 71,  2, 72, 69, 82,  2, 70],
        [83, 84,  2, 40, 65, 71, 82, 73, 68,  2, 84, 79, 79, 14,  8, 94,  2, 94,
          2, 52, 72, 69, 89,  2, 84, 82, 65, 73, 80, 83, 69, 68],
        [84, 72, 69,  2, 83, 67, 72, 79, 79, 76, 14,  2, 60,  4, 40, 69,  2, 83,
         85, 82, 69, 76, 89,  2, 68, 79, 69, 83, 78,  8, 84,  2],
        [79, 76, 68, 73, 78, 71,  2, 79, 85, 84,  2, 70, 79, 82,  2, 85, 78, 73,
         86, 69, 82, 83, 65, 76,  2, 80, 79, 80, 85, 76, 65, 82],
        [77, 14,  2, 41, 78, 68, 69, 69, 68, 12,  2, 48, 82, 79, 70, 69, 83, 83,
         79, 82,  2, 45, 67, 39, 79, 78, 65, 71, 65, 76, 76,  2],
        [85, 78, 68, 73, 78, 71,  2, 83, 73, 78, 67, 69,  2, 84, 72, 69,  2, 80,
         82, 69, 86, 73, 79, 85, 83,  2, 89, 69, 65, 82, 14, 94],
        [ 2, 80, 76, 65, 78, 69, 84,  3,  2, 47, 79, 79, 72, 12,  2, 87, 72, 73,
        

In [16]:
print(xb)

tensor([[12,  8,  2, 83, 65, 73, 68,  2, 40, 69, 82, 77, 73, 79, 78, 69, 12,  2,
         84, 87, 73, 83, 84, 73, 78, 71,  2, 72, 69, 82,  2, 70],
        [83, 84,  2, 40, 65, 71, 82, 73, 68,  2, 84, 79, 79, 14,  8, 94,  2, 94,
          2, 52, 72, 69, 89,  2, 84, 82, 65, 73, 80, 83, 69, 68],
        [84, 72, 69,  2, 83, 67, 72, 79, 79, 76, 14,  2, 60,  4, 40, 69,  2, 83,
         85, 82, 69, 76, 89,  2, 68, 79, 69, 83, 78,  8, 84,  2],
        [79, 76, 68, 73, 78, 71,  2, 79, 85, 84,  2, 70, 79, 82,  2, 85, 78, 73,
         86, 69, 82, 83, 65, 76,  2, 80, 79, 80, 85, 76, 65, 82],
        [77, 14,  2, 41, 78, 68, 69, 69, 68, 12,  2, 48, 82, 79, 70, 69, 83, 83,
         79, 82,  2, 45, 67, 39, 79, 78, 65, 71, 65, 76, 76,  2],
        [85, 78, 68, 73, 78, 71,  2, 83, 73, 78, 67, 69,  2, 84, 72, 69,  2, 80,
         82, 69, 86, 73, 79, 85, 83,  2, 89, 69, 65, 82, 14, 94],
        [ 2, 80, 76, 65, 78, 69, 84,  3,  2, 47, 79, 79, 72, 12,  2, 87, 72, 73,
         67, 72,  2, 79, 78, 69,  8, 

For example: <br>
For the first one (67), BigramLangModel will look for the 67th row of the Embedding table

最初の行（67）に対して、BigramLangModelは埋め込みテーブルの67行目を探す。

```
tensor([[67, 65,  0, 53, 54, 71, 67, 64],  
        [ 1,  1, 32,  1, 53, 65,  7,  1],  
        [53, 64,  1, 72, 60, 53, 72,  1],  
        [61, 67, 66,  1, 53, 66, 56,  1]])   
```

### Single Head

Scaled Dot-Product Attention <br>
https://paperswithcode.com/method/scaled

Figure2 (Left) in the ["Attention Is All You Need" paper](https://arxiv.org/pdf/1706.03762v5.pdf)

![image.png](./img/ScaledDotProduct.png)


In [17]:
# Single Head
class Head(nn.Module):
    def __init__(self,  head_size):
        super().__init__()
        self.key = nn.Linear(n_embed, head_size, bias=False)
        self.query = nn.Linear(n_embed, head_size, bias=False)
        self.value = nn.Linear(n_embed, head_size, bias=False)
        self.register_buffer("tril", torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def ScaledDotProductAttention(self, k, q, B, T, C):
        # MatMul + Scale
        weight = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, head_size) @ (B, head_size, T) --> (B, T, T)
        # Mask (opt.)
        weight = weight.masked_fill(self.tril[:T,:T] == 0, float('-inf'))  # (B, T, T) # setting it to infinity, we will not aggregate the tokens from the past, in other words future cannot communicate from the past
        # SoftMax
        weight = F.softmax(weight, dim = -1) # Normalize weight

        return weight

    def forward(self, x):
        B,T,C = x.shape

        k = self.key(x) # (B, T, head_size)
        q = self.query(x) # (B, T, head_size)

        # compute attention scores # Scale Dot-Product Attention
        weight = self.ScaledDotProductAttention(k, q, B, T, C)
        weight = self.dropout(weight)

        # perform the weighted aggregation of the value
        v = self.value(x) # Thing that get aggregated for the purposes of the single head
        xBagOfWords = weight @ v

        return xBagOfWords




![image.png](./img/MultiHeadAttention.png)

In [18]:
# multi heads of self attention in parallel
class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embed, n_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # Scaled Dot Product Attention and concat
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        # Linear layer at the end
        out = self.proj(out)
        out = self.dropout(out)

        return out

## Blocks

Way to prevent degradation problem, by skipping some part of the layers.

Degradation problem: <br>
Shallower networks perform better than the deeper counterparts with few more layers added to them

https://arxiv.org/abs/1512.03385

https://paperswithcode.com/method/residual-block

https://towardsdatascience.com/residual-blocks-building-blocks-of-resnet-fd90ca15d6ec



In [19]:
# a simple linear layer followed by a non-linearity
class FeedFoward(nn.Module):
    def __init__(self, n_embed):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embed, 4 * n_embed),
            nn.ReLU(),
            nn.Linear(4 * n_embed, n_embed),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

In [20]:
# Transformer block:  communication followed by computation
class Block(nn.Module):
    def __init__(self, n_embed, n_head):
        #  n_embed : embedding demensions,
        #  n_head: the number of heads
        super().__init__()
        head_size = n_embed // n_head
        self.sa = MultiHeadAttention(num_heads = n_head, head_size = head_size)
        self.ffwd = FeedFoward(n_embed)
        self.ln1 = nn.LayerNorm(n_embed)
        self.ln2 = nn.LayerNorm(n_embed)

    def forward(self, x):
        '''
        Different from the "All You Need Is Attention" paper.
        In transformer paper, Norm Layer is located after the multihead attention but this time it is located after the multihead attention.
        '''
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))

        return x


In [21]:
class BigramLangModel(nn.Module):
    def __init__(self, vocab_size, n_embed) -> None:
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(block_size, n_embed) # Embedding table for position of the token
        self.blocks = nn.Sequential(*[Block(n_embed, n_head=n_head) for _ in range(n_layer)])
        self.lm_head = nn.Linear(n_embed, vocab_size)
        self.ln_f = nn.LayerNorm(n_embed)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both tensor
        tok_emb = self.token_embedding_table(idx) # (Batch(batch size), Time(block size), Channel(embed))
        pos_emb = self.position_embedding_table(torch.arange(T,device=device)) # (T,C) integers of 0 ~ T-1
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # apply one head of self-attention (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (Batch(batch size), Time(block size), vocab size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape # (Batch(batch size), Time(block size), Channel(embed))
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B,T) array of indices in current context
        for _ in range(max_new_tokens):
            # crop idx to the last block size tokens # we can never have more than block size
            idx_cond = idx[:,-block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last step
            logits = logits[:, -1, :]
            # apply softmax to the probabilities
            probs = F.softmax(logits, dim =-1) #(B,C)
            # sample from distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            #append sampled index to running sequence
            idx = torch.cat((idx, idx_next), dim = 1)
        return idx


In [22]:
@torch.no_grad()
def estimate_loss(model):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = dataLoader.get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out


In [23]:
if train_from_scratch:

  model = BigramLangModel(vocab_size = vocab_size, n_embed=n_embed)
  model.to(device)


  # print the number of parameters in the model
  print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')

  # create a PyTorch optimizer
  optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

  for iter in range(max_iters):

      # every once in a while evaluate the loss on train and val sets
      if iter % eval_interval == 0 or iter == max_iters - 1:
          losses = estimate_loss(model)
          print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

      # sample a batch of data
      xb, yb = dataLoader.get_batch('train')

      # evaluate the loss
      logits, loss = model(xb, yb)
      optimizer.zero_grad(set_to_none=True)
      loss.backward()
      optimizer.step()

  torch.save({
            'epoch' : max_iters,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': losses['train']
            }, f"{model_path}/{max_iters}")

  # generate from the model
  context = torch.zeros((1, 1), dtype=torch.long, device=device)
  print(tokenizer.decoder(model.generate(context, max_new_tokens=2000)[0].tolist()))

In [24]:

import glob
import re
saved_models = glob.glob(f"{model_path}/*")
print(saved_models)
saved_iters = [re.findall(f'{model_path}/(.*)', saved_model)[0] for saved_model in saved_models]
print(saved_iters)
saved_iters = [int(iter) for iter in saved_iters]

saved_iters.sort(reverse = True)

print(saved_iters)

train_from_scratch = False
check_iters = saved_iters[0]
max_iters = 230000

if not train_from_scratch:
  device = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'

  model = BigramLangModel(vocab_size = vocab_size, n_embed=n_embed)
  model.to(device)

  # create a PyTorch optimizer
  optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

  checkpoint = torch.load(f"{model_path}/{check_iters}")
  model.load_state_dict(checkpoint['model_state_dict'])
  optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
  checkpoint_epoch = checkpoint['epoch']
  checkpoint_loss = checkpoint['loss']
  print(f"epoch: {checkpoint_epoch} loss: {checkpoint_loss}")


  # print the number of parameters in the model
  print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')


  for iter in range(checkpoint_epoch, max_iters):

      # every once in a while evaluate the loss on train and val sets
      if iter % eval_interval == 0 or iter == max_iters - 1:
          losses = estimate_loss(model)
          print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

      # sample a batch of data
      xb, yb = dataLoader.get_batch('train')

      # evaluate the loss
      logits, loss = model(xb, yb)
      optimizer.zero_grad(set_to_none=True)
      loss.backward()
      optimizer.step()

  torch.save({
            'epoch' : max_iters,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': losses['train']
            }, f"{model_path}/{max_iters}")

  # generate from the model
  context = torch.zeros((1, 1), dtype=torch.long, device=device)
  print(tokenizer.decoder(model.generate(context, max_new_tokens=2000)[0].tolist()))


['./saved_model/10000', './saved_model/100000', './saved_model/150000', './saved_model/160000', './saved_model/200000']
['10000', '100000', '150000', '160000', '200000']
[200000, 160000, 150000, 100000, 10000]
epoch: 200000 loss: 1.3065040111541748
0.21476 M parameters
step 200000: train loss 1.2983, val loss 1.2801
step 200100: train loss 1.2991, val loss 1.2984
step 200200: train loss 1.3099, val loss 1.2914
step 200300: train loss 1.2995, val loss 1.2955
step 200400: train loss 1.3096, val loss 1.2972
step 200500: train loss 1.2915, val loss 1.2864
step 200600: train loss 1.2953, val loss 1.2873
step 200700: train loss 1.3086, val loss 1.2923
step 200800: train loss 1.2939, val loss 1.2957
step 200900: train loss 1.3047, val loss 1.2955
step 201000: train loss 1.3072, val loss 1.2778
step 201100: train loss 1.3008, val loss 1.2842
step 201200: train loss 1.2955, val loss 1.2917
step 201300: train loss 1.2999, val loss 1.2910
step 201400: train loss 1.3047, val loss 1.2947
step 20150