Byte Pair Encoding (BPE) is a text compression technique that iteratively replaces the most frequent pair of bytes in a sequence with a single, unused byte. Adapted for natural language processing, BPE is used in tokenization, where it iteratively merges the most frequent pair of adjacent characters or character sequences (tokens) into a single new token. This approach allows handling of common words as single entities and rare words or names to be broken down into smaller pieces, improving the model's ability to understand and generate text by efficiently managing vocabulary size and out-of-vocabulary words.

In [27]:
!pip install transformers tiktoken



In [2]:
import os
import json
import regex as re
import requests
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
import pickle
import math
import time
from collections import defaultdict
import tiktoken

In [3]:
# download the tiny shakespeare dataset
data_dir = os.path.join('data', 'tinyshakespeare')
input_file_path = os.path.join(data_dir, 'input.txt')
if not os.path.exists(input_file_path):
    data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
    os.makedirs(data_dir)
    with open(input_file_path, 'w') as f:
        f.write(requests.get(data_url).text)

with open(input_file_path, 'r') as f:
    data = f.read()
n = len(data)
print("n",n," ",(n*0.9))
train_data = data[:int(n*0.9)]
val_data = data[int(n*0.9):]
print("train_data",train_data[0:30])
# encode with tiktoken gpt2 bpe
enc = tiktoken.get_encoding("gpt2")
train_ids = enc.encode_ordinary(train_data)
val_ids = enc.encode_ordinary(val_data)
print(f"train has {len(train_ids):,} tokens")
print(f"val has {len(val_ids):,} tokens")

# export to bin files
train_ids = np.array(train_ids, dtype=np.uint16)
val_ids = np.array(val_ids, dtype=np.uint16)
train_ids.tofile(os.path.join(data_dir, 'train.bin'))
val_ids.tofile(os.path.join(data_dir, 'val.bin'))

n 1115394   1003854.6
train_data First Citizen:
Before we proce
train has 301,966 tokens
val has 36,059 tokens


In [69]:
class GPTConfig:
    def __init__(self, vocab_size, **kwargs):
        self.vocab_size = vocab_size
        for key, value in kwargs.items():
            setattr(self, key, value)

class CustomConfig(GPTConfig):
    # model
    n_layer = 5 #increase number of decoder layer more than 1
    n_head = 8
    n_embd = 256
    embd_pdrop = 0.1
    resid_pdrop = 0.1
    attn_pdrop = 0.1
    dropout = 0.1
    compile = True

    # data
    device = 'cuda'
    num_workers = 0

    # optimizer parameters
    max_iters = 2e4
    batch_size = 4
    block_size = 64
    learning_rate = 6e-4
    betas = (0.9, 0.95)
    weight_decay = 1e-1
    grad_norm_clip = 1.0

# config
vocab_size = len(train_ids)
config = CustomConfig(vocab_size=vocab_size)

In [70]:
data_dir = os.path.join('data', 'tinyshakespeare')
train_data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
val_data = np.memmap(os.path.join(data_dir, 'val.bin'), dtype=np.uint16, mode='r')

class ShakespeareDataset(Dataset):
    def __init__(self, split, block_size=128, device_type='cuda'):
        assert split in {'train', 'test'}
        self.split = split
        self.block_size = block_size
        self.device_type = device_type
        self.data = train_data if split == 'train' else val_data

    def __len__(self):
        return len(self.data) - self.block_size

    def __getitem__(self, idx):
        # ix = torch.randint(len(data) - block_size, (batch_size,))
        # x = torch.stack([torch.from_numpy((data[i:i+block_size]).astype(np.int64)) for i in ix])
        # y = torch.stack([torch.from_numpy((data[i+1:i+1+block_size]).astype(np.int64)) for i in ix])
        x = torch.from_numpy(self.data[idx : idx + self.block_size].astype(np.int64))
        y = torch.from_numpy(self.data[idx + 1 : idx + 1 + self.block_size].astype(np.int64))


        x, y = x.to('cpu'), y.to('cpu')
        return x, y

train_dataset = ShakespeareDataset('train', config.block_size, config.device)
train_loader = DataLoader(train_dataset, batch_size=config.batch_size, num_workers=config.num_workers, drop_last=False)
test_dataset = ShakespeareDataset('test', config.block_size, config.device)
test_loader = DataLoader(test_dataset, batch_size=config.batch_size, num_workers=config.num_workers, drop_last=False)
sample_data = next(iter(train_loader))
x, y = sample_data
print("x:", x.size())
print("y:", y.size())
print("X",x[0])

x: torch.Size([4, 64])
y: torch.Size([4, 64])
X tensor([ 5962, 22307,    25,   198,  8421,   356,  5120,   597,  2252,    11,
         3285,   502,  2740,    13,   198,   198,  3237,    25,   198,  5248,
          461,    11,  2740,    13,   198,   198,  5962, 22307,    25,   198,
         1639,   389,   477, 12939,  2138,   284,  4656,   621,   284,  1145,
          680,    30,   198,   198,  3237,    25,   198,  4965,  5634,    13,
        12939,    13,   198,   198,  5962, 22307,    25,   198,  5962,    11,
          345,   760,   327,  1872])


In [71]:
class NewGELU(nn.Module):
    """
    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT).
    Reference: Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415
    """
    def forward(self, x):
        return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))


class CausalSelfAttention(nn.Module):
    """
    A vanilla multi-head masked self-attention layer with a projection at the end.
    It's important in decoder block to have diagonal mask
    It is also possible to use torch.nn.MultiheadAttention.
    """

    def __init__(self, config):
        super().__init__()
        ''' The assert statement checks that the model's embedding dimension (config.n_embd) is
         divisible by the number of attention heads (config.n_head). This is crucial because, in
         multi-head attention mechanisms, the embedding dimension must be evenly split among the heads for parallel processing.
         If this condition is not met, the assert statement raises an AssertionError, halting execution.
        This ensures the model configuration is valid for the intended architecture.'''
        assert config.n_embd % config.n_head == 0
        # key, query, value projections for all heads, but in a batch

        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd) #256*(3*256)
        # output projection
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)  #256*256
        # regularization
        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)
        self.dropout = config.dropout
        self.n_head = config.n_head  #8
        self.n_embd = config.n_embd

        # flash attention make GPU go faster but support is only in PyTorch >= 2.0
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        if not self.flash:
            print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
            # causal mask to ensure that attention is only applied to the left in the input sequence
            # use register_buffer when: you want a stateful part of your model that is not a parameter
            # but you want it in your state_dict
            self.register_buffer(
                "mask",
                torch.tril(torch.ones(config.block_size, config.block_size)
            ).view(1, 1, config.block_size, config.block_size))
            """ If the block_size is 64, the output tensor's size after applying
            torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size) will be [1, 1, 64, 64].
             This represents a 4-dimensional tensor where the lower triangular matrix of size 64x64 is reshaped to have two additional leading dimensions
             of size 1, primarily for batch and head dimensions compatibility in attention mechanisms."""
    def forward(self, x):
        # batch_size, seq_len, emb_dim
        B, T, C = x.size()
        print("B, T, C",x.size(),B) #4*64*256
        print("self.c_attn(x)",self.c_attn(x).size()) #4*64*768
        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        # (b, seq_len, emb_dim) --> (b, seq_len, emb_dim * 3) --> (b, seq_len, emb_dim)
        q, k, v  = self.c_attn(x).split(self.n_embd, dim=2)
        print(k.size())
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (b, h, seq_len, d_k)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (b, h, seq_len, d_k)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (b, h, seq_len, d_k)
        print("K Size",k.size(),k.shape)
        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        """Causal self-attention is a mechanism in transformers that restricts the model from attending to future tokens in the input sequence,
        ensuring each position can only be influenced by itself and preceding positions. This is crucial for generating coherent sequences in tasks like language modeling, where the prediction for a current token must
         not depend on future tokens, maintaining the sequence's causal structure. """

        if self.flash:
            # efficient attention using Flash Attention CUDA kernels
            y = torch.nn.functional.scaled_dot_product_attention(
                q, k, v, attn_mask=None, dropout_p=self.dropout if self.training else 0, is_causal=True
            )
        else:
            # (b, h, seq_len, d_k) matmul (b, h, d_k, seq_len) --> (b, h, seq_len, seq_len)
            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
            print("ATT",att)
            # diagonal mask
            # fill 0 mask with super small number so it wont affect the softmax weight
            # (batch_size, h, seq_len, seq_len)
            att = att.masked_fill(self.mask[:,:,:T,:T] == 0, float('-inf'))
            print("ATT_maskes",att)
            att = F.softmax(att, dim=-1)
            att = self.attn_dropout(att)

            # (b, h, seq_len, seq_len) matmul (b, h, seq_len, d_k) --> (b, h, seq_len, d_k)
            y = att @ v

        # (b, h, seq_len, d_k) --> (b, seq_len, h, d_k) --> (b, seq_len, d_model)
        print("Y-Before linear",y.size())#4*8*64*32
        y = y.transpose(1, 2).contiguous().view(B, T, C)#4*64*8*32

        # output projection
        y = self.resid_dropout(self.c_proj(y))
        print("Y",y.size())
        return y


In [72]:
class Block(nn.Module):
    """ GPT only contain decode block"""

    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)


        self.mlp = nn.ModuleDict(dict(
            c_fc    = nn.Linear(config.n_embd, 4 * config.n_embd),
            act     = NewGELU(),
            c_proj  = nn.Linear(4 * config.n_embd, config.n_embd),
            dropout = nn.Dropout(config.resid_pdrop),
        ))
        m = self.mlp
        self.mlpf = lambda x: m.dropout(m.c_proj(m.act(m.c_fc(x))))

    def forward(self, x):

        # (batch_size, seq_len, emb_dim)
        print("ln_1",self.ln_1(x).size())
        x = x + self.attn(self.ln_1(x))
        print("Middle x",x.size())
        print("MLPF",self.mlpf(self.ln_2(x)).size())
        x = x + self.mlpf(self.ln_2(x))
        return x

### testing
wte = nn.Embedding(config.vocab_size, config.n_embd)
block = Block(config)

tok_emb = wte(x)
print('Token Embedding Size:', tok_emb.size())

block_out = block(tok_emb)
print('Block Output Size:', block_out.size())

Token Embedding Size: torch.Size([4, 64, 256])
ln_1 torch.Size([4, 64, 256])
B, T, C torch.Size([4, 64, 256]) 4
self.c_attn(x) torch.Size([4, 64, 768])
torch.Size([4, 64, 256])
K Size torch.Size([4, 8, 64, 32]) torch.Size([4, 8, 64, 32])
Y-Before linear torch.Size([4, 8, 64, 32])
Y torch.Size([4, 64, 256])
Middle x torch.Size([4, 64, 256])
MLPF torch.Size([4, 64, 256])
Block Output Size: torch.Size([4, 64, 256])


In [92]:
import torch._dynamo
torch._dynamo.config.suppress_errors = True
class GPT(nn.Module):
    """ GPT Language Model """

    def __init__(self, config):
        super().__init__()
        self.block_size = config.block_size

        self.positional_encoding = self._get_positional_encoding(config.block_size, config.n_embd)
        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            # wpe = nn.Embedding(config.block_size, config.n_embd),
            drop = nn.Dropout(config.embd_pdrop),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # init all weights, and apply a special scaled init to the residual projections, per GPT-2 paper
        self.apply(self._init_weights)
        print("named_parameters",self.named_parameters)
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))

        # report number of parameters (note we don't count the decoder parameters in lm_head)
        n_params = sum(p.numel() for p in self.transformer.parameters())
        print("number of parameters: %.2fM" % (n_params/1e6,))

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        elif isinstance(module, nn.LayerNorm):
            torch.nn.init.zeros_(module.bias)
            torch.nn.init.ones_(module.weight)

    def _get_positional_encoding(self, max_seq_len, embed_size):
        positional_encoding = torch.zeros(max_seq_len, embed_size)
        position = torch.arange(0, max_seq_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, embed_size, 2).float() * -(math.log(10000.0) / embed_size))
        positional_encoding[:, 0::2] = torch.sin(position * div_term)
        positional_encoding[:, 1::2] = torch.cos(position * div_term)
        return positional_encoding

    def configure_optimizers(self, train_config):

        # separate out all parameters to those that will and won't experience regularizing weight decay
        decay = set()
        no_decay = set()
        whitelist_weight_modules = (torch.nn.Linear, )
        blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding)
        for mn, m in self.named_modules():
            for pn, p in m.named_parameters():
                fpn = '%s.%s' % (mn, pn) if mn else pn # full param name
                # random note: because named_modules and named_parameters are recursive
                # we will see the same tensors p many many times. but doing it this way
                # allows us to know which parent module any tensor p belongs to...
                if pn.endswith('bias'):
                    # all biases will not be decayed
                    no_decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
                    # weights of whitelist modules will be weight decayed
                    decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
                    # weights of blacklist modules will NOT be weight decayed
                    no_decay.add(fpn)
        print("Pn",pn)
        print("decay",decay)
        # validate that we considered every parameter
        param_dict = {pn: p for pn, p in self.named_parameters()}
        inter_params = decay & no_decay
        union_params = decay | no_decay

        # create the pytorch optimizer object
        optim_groups = [
            {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": train_config.weight_decay},
            {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
        ]
        optimizer = torch.optim.AdamW(optim_groups, lr=train_config.learning_rate, betas=train_config.betas)
        return optimizer

    def forward(self, idx, targets=None):
        device = idx.device
        b, t = idx.size()
        assert t <= self.block_size, f"Cannot forward sequence of length {t}, block size is only {self.block_size}"
        tok_emb = self.transformer.wte(idx) # (b, t, n_embd)
        pos_emb = self.positional_encoding[:t, :].unsqueeze(0) # (1, t, n_embd)

        # positional token, shape (1, t)
        # pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0)
        # pos_emb = self.transformer.wpe(pos) # position embeddings of shape (1, t, n_embd)

        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)

        x = self.transformer.ln_f(x)
        # (b, t, n_embd) -- > # (b, t, vocab_size)
        logits = self.lm_head(x)

        # if we are given some desired targets also calculate the loss
        # -1 at output will be ignored
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, do_sample=False, top_k=None):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b, t)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        Most likely you'll want to make sure to be in model.eval() mode of operation for this.
        """
        for _ in range(max_new_tokens):
            # if the sequence context is growing too long we must crop it at block_size
            idx_cond = idx if idx.size(1) <= self.block_size else idx[:, -self.block_size:]

            # forward the model to get the logits for the index in the sequence
            logits, _ = self(idx_cond)

            # pluck the logits at the final step and scale by desired temperature
            logits = logits[:, -1, :] / temperature
            print("LOGITS",logits)
            logits = torch.exp(logits) / torch.sum(np.exp(logits), axis=-1, keepdims=True)

            # optionally crop the logits to only the top k options
            if top_k is not None:
                # from [0.2769, 0.1019, 0.3351, 0.1967, 0.0882] --> [0.452, 0.547]
                v, _ = torch.topk(logits, top_k)
                logits[logits < v[:, [-1]]] = -float('Inf')

            # apply softmax to convert logits to (normalized) probabilities
            print("Logits2",logits)
            probs = F.softmax(logits, dim=-1)
            print("probs",probs,probs.shape)
            if do_sample:
                # select from [0.1, 0.2, 0.3, 0.2, 0.2] --> [2, 4] (if num_samples=2)
                idx_next = torch.multinomial(probs, num_samples=1)
            else:
                # take the top probs index
                _, idx_next = torch.topk(probs, k=1, dim=-1)

            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)
            print("IDX",idx)  #keep adding the idx value
        return idx

### testing
wte = nn.Embedding(config.vocab_size, config.n_embd)
model = GPT(config)
model = torch.compile(model)

# sample dataset from data loader
logits, loss = model.forward(x, y)
print('logits: ', logits.size())
print('loss: ', loss)

named_parameters <bound method Module.named_parameters of GPT(
  (transformer): ModuleDict(
    (wte): Embedding(301966, 256)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=256, out_features=768, bias=True)
          (c_proj): Linear(in_features=256, out_features=256, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (mlp): ModuleDict(
          (c_fc): Linear(in_features=256, out_features=1024, bias=True)
          (act): NewGELU()
          (c_proj): Linear(in_features=1024, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  )
  

In [None]:
class Trainer:

    def __init__(self, config, model, train_dataset):
        self.config = config
        self.model = model
        self.optimizer = None
        self.train_dataset = train_dataset
        self.callbacks = defaultdict(list)
        self.device = config.device
        self.model = self.model

        # variables that will be assigned to trainer class later for logging and etc
        self.iter_num = 0
        self.iter_time = 0.0
        self.iter_dt = 0.0

    def add_callback(self, onevent: str, callback):
        self.callbacks[onevent].append(callback)

    def set_callback(self, onevent: str, callback):
        self.callbacks[onevent] = [callback]

    def trigger_callbacks(self, onevent: str):
        for callback in self.callbacks.get(onevent, []):
            callback(self)

    def run(self):
        model, config = self.model, self.config

        # setup the optimizer
        self.optimizer = model.configure_optimizers(config)

        # setup the dataloader
        train_loader = DataLoader(
            self.train_dataset,
            sampler=torch.utils.data.RandomSampler(self.train_dataset, replacement=True, num_samples=int(1e10)),
            shuffle=False,
            # pin_memory=True,
            batch_size=config.batch_size,
            num_workers=config.num_workers,
        )

        model.train()
        self.iter_num = 0
        self.iter_time = time.time()
        data_iter = iter(train_loader)
        while True:

            # fetch the next batch (x, y) and re-init iterator if needed
            try:
                batch = next(data_iter)
            except StopIteration:
                data_iter = iter(train_loader)
                batch = next(data_iter)
            #batch = [t.to(self.device) for t in batch]
            x, y = batch

            # forward the model
            logits, self.loss = model(x, y)

            # backprop and update the parameters
            model.zero_grad(set_to_none=True)
            self.loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), config.grad_norm_clip)
            self.optimizer.step()

            self.trigger_callbacks('on_batch_end')
            self.iter_num += 1
            tnow = time.time()
            self.iter_dt = tnow - self.iter_time
            self.iter_time = tnow

            # termination conditions
            if config.max_iters is not None and self.iter_num >= config.max_iters:
                break

model = GPT(config)
trainer = Trainer(config, model, train_dataset)
trainer = Trainer(config, model, train_dataset)

def batch_end_callback(trainer):
    if trainer.iter_num % 5 == 0:
        print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")
trainer.set_callback('on_batch_end', batch_end_callback)

trainer.run()

In [None]:
text = 'ou are all resolved rather to '
sample_ids = torch.Tensor(enc.encode_ordinary(text)).long()
sample_ids = torch.unsqueeze(sample_ids, 0)
result = model.generate(sample_ids, max_new_tokens=50, temperature=1, do_sample=False, top_k=None)

#print(enc.decode(result.detach().tolist()[0]))

In [80]:
torch.save(model.state_dict(), 'model_gpt.pth')

In [82]:
print(enc.decode(result.detach().tolist()[0]))

Lord:
Rise! My people, conquer the north!


















































