<a href="https://colab.research.google.com/github/pranay8297/llm/blob/main/gpt2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tiktoken
!pip install triton

In [1]:
import math
import torch
import tiktoken
import time

from torch import nn
from torch.nn import functional as F
from dataclasses import dataclass
from einops import rearrange

# Building Blocks

In [2]:
@dataclass
class GPTConfig:
    block_size: int = 1024
    vocab_size: int = 50257
    n_layer: int = 12
    n_head: int = 12
    n_embd: int = 768

enc = tiktoken.get_encoding('gpt2')

torch.manual_seed(1337)
if torch.cuda.is_available():
    torch.cuda.manual_seed(1337)

device = 'cpu'
if torch.cuda.is_available(): device = 'cuda'


In [3]:
class CausalSelfAttentionOrig(nn.Module):

    def __init__(self, config):

        # without Falsh Attention

        super().__init__()
        assert config.n_embd % config.n_head == 0
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.c_proj.NANOGPT_SCALE_INIT = 1

        self.n_head = config.n_head
        self.n_embd = config.n_embd

        self.register_buffer('bias', torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))

    def forward(self, x):
        B, T, C = x.size()
        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embd, dim=2)

        k = rearrange(k, 'u v (w x) -> u w v x', w = self.n_head)
        q = rearrange(q, 'u v (w x) -> u w v x', w = self.n_head)
        v = rearrange(v, 'u v (w x) -> u w v x', w = self.n_head)

        sim = q@k.transpose(-1, -2)/math.sqrt(x.shape[-1])
        attn = sim.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
        attn = F.softmax(attn, dim = -1)

        y = attn @ v
        y = rearrange(y, 'u w v x -> u v (w x)')

        y = self.c_proj(y)
        return y

class CasualSelfAttn(nn.Module):

    def __init__(self, config: GPTConfig):

        super().__init__()
        assert config.n_embd % config.n_head == 0
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.c_proj.NANOGPT_SCALE_INIT = 1

        self.n_head = config.n_head
        self.n_embd = config.n_embd

    def forward(self, x: torch.tensor):

        B, T, C = x.size()
        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
        y = F.scaled_dot_product_attention(q, k, v, is_causal=True)
        y = y.transpose(1, 2).contiguous().view(B, T, C)

        y = self.c_proj(y)
        return y

class MLP(nn.Module):
    def __init__(self, config: GPTConfig):
        super().__init__()
        self.c_fc = nn.Linear(config.n_embd, 4*config.n_embd)
        self.act = nn.GELU(approximate = 'tanh')
        self.c_proj = nn.Linear(4*config.n_embd, config.n_embd)
        self.c_proj.NANOGPT_SCALE_INIT = 1

    def forward(self, x: torch.tensor):

        x = self.c_fc(x) # B x block_size x 4*n_embd
        x = self.act(x) # B x block_size x 4*n_embd
        x = self.c_proj(x) # B x block_size x n_embd
        return x

class DecoderBlock(nn.Module):
    def __init__(self, config: GPTConfig):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CasualSelfAttn(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x: torch.tensor):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x

class GPT(nn.Module):

    '''
    embeddings -> Casual Self Attention -> MLP
    '''

    def __init__(self, config: GPTConfig):
        super().__init__()

        self.config = config
        self.transformer = nn.ModuleDict(
            dict(
                wpe = nn.Embedding(config.block_size, config.n_embd),
                wte = nn.Embedding(config.vocab_size, config.n_embd),
                h = nn.ModuleList([DecoderBlock(config) for _ in range(config.n_layer)]),
                ln_f = nn.LayerNorm(config.n_embd)
            )
        )
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size)

        # Weight Trying Scheme
        self.transformer.wte.weight = self.lm_head.weight

        # Initialize the model
        self.apply(self._init_weights)

    def _init_weights(self, module):

        std = 0.02
        if isinstance(module, nn.Linear):

            if hasattr(module, "NANOGPT_SCALE_INIT"):
                std *= (2*self.config.n_layer)**(-0.5)

            torch.nn.init.normal_(module.weight, std = std)

            if module.bias is not None: torch.nn.init.zeros_(module.bias)

        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, std = std)

    def configure_optmizers(self, lr = 6e-04, wd = 1e-01, betas = (0.9, 0.95), eps = 1e-08, device_type = device):
        """

        Args:
          lr:
          wd:
          betas:
          eps:
          device_type:

        Returns:

        """

        # divide into two param groups
        # Assign weight decay to the ones with dim > 1

        decay_params = []
        non_decay_params = []
        for name, param in self.named_parameters():
            if not param.requires_grad: continue
            if param.dim() == 1: non_decay_params.append(param)
            else: decay_params.append(param)

        fused = True if device_type == 'cuda' else False
        optim = torch.optim.AdamW([
                                    {'params': decay_params, "weight_decay": wd},
                                    {'params': non_decay_params, "weight_decay": 0},
                                  ], lr = lr, betas = betas, eps = eps, fused = fused)
        return optim

    def forward(self, idx: torch.tensor, y: torch.tensor = None):
        """

        Args:
          idx:
          y:

        Returns:

        """

        # idx -> B, T
        # 1. Process Embeddings
        # 2. Iterative attention blocks
        # 3. Pass it through lm_head for final layer

        # assert idx.shape[-1] == self.config.block_size
        positions = torch.arange(0, idx.shape[-1], step = 1).to(idx.device)
        pos_embeddings = self.transformer.wpe(positions) # B, T
        tok_embeddings = self.transformer.wte(idx) # B, T, C

        x = pos_embeddings[None, :, :] + tok_embeddings

        for block in self.transformer.h:
            x = block(x)

        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)
        loss = None
        if y is not None:
            loss = F.cross_entropy(logits.view(-1, logits.shape[-1]), y.view(-1))

        return  logits, loss

# Evaluation Snippet

In [5]:
enc = tiktoken.get_encoding('gpt2')
sentence = "Hello, I'm designed to do"
tokens = enc.encode(sentence)
tokens = torch.tensor(tokens, dtype = torch.long)
tokens = tokens.unsqueeze(0).repeat(4, 1)
tokens = tokens.to(device)

model.eval()
with torch.no_grad():
    target_len = 30
    x = tokens
    for i in range(target_len):

        logits = model(x)
        logits = logits[:, -1, :]
        topk_probs, topk_idxs = torch.topk(F.softmax(logits, dim = -1), k = 50)

        ix = torch.multinomial(topk_probs, 1)
        xcol = torch.gather(topk_idxs, -1, ix)
        x = torch.cat([x, ix], dim = 1)

for i in x:
    decoded = enc.decode(i.tolist())
    print(decoded)

NameError: name 'model' is not defined

# Data Set and Data Loader

In [4]:
!wget https://raw.githubusercontent.com/karpathy/build-nanogpt/master/input.txt

--2024-09-21 01:09:25--  https://raw.githubusercontent.com/karpathy/build-nanogpt/master/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.3’


2024-09-21 01:09:25 (84.7 MB/s) - ‘input.txt.3’ saved [1115394/1115394]



In [5]:
class DataLoaderLite:
    def __init__(self, B: int, T: int = 1024):

        with open('./input.txt', 'r') as f: data = f.read()
        tokens = enc.encode(data)
        self.tokens = torch.tensor(tokens, dtype = torch.long)
        self.B = B
        self.T = T
        self.current_position = 0

    def next_batch(self):
        if self.current_position + self.B*self.T > len(self.tokens):
            self.current_position = 0

        offset = self.current_position + self.B*self.T + 1
        if offset > len(self.tokens): offset = len(tokens)

        buf = self.tokens[self.current_position:offset]
        x = buf[:-1].view(self.B, -1)
        y = buf[1:].view(self.B, -1)

        self.current_position += self.B*self.T
        if self.current_position > len(self.tokens): self.current_position = 0

        return x, y


# Testing Activations

In [30]:
config = GPTConfig()
model = GPT(config)
model = model.to(device)
dl = DataLoaderLite(B = 4, T = 64)

In [31]:
x, y = dl.next_batch()
idx = x
positions = torch.arange(0, idx.shape[-1], step = 1).to(idx.device)
pos_embeddings = model.transformer.wpe(positions) # B, T
tok_embeddings = model.transformer.wte(idx) # B, T, C

x = pos_embeddings[None, :, :] + tok_embeddings
print(x.mean(), x.std())
for block in model.transformer.h:
    x = block(x)
    print(x.mean(), x.std())

x = model.transformer.ln_f(x)
print(x.mean(), x.std())
logits = model.lm_head(x)
print(x.mean(), x.std())

tensor(-0.0002, grad_fn=<MeanBackward0>) tensor(0.0283, grad_fn=<StdBackward0>)
tensor(-0.0011, grad_fn=<MeanBackward0>) tensor(0.0807, grad_fn=<StdBackward0>)
tensor(-0.0015, grad_fn=<MeanBackward0>) tensor(0.1140, grad_fn=<StdBackward0>)
tensor(-0.0006, grad_fn=<MeanBackward0>) tensor(0.1441, grad_fn=<StdBackward0>)
tensor(-0.0006, grad_fn=<MeanBackward0>) tensor(0.1683, grad_fn=<StdBackward0>)
tensor(0.0013, grad_fn=<MeanBackward0>) tensor(0.1912, grad_fn=<StdBackward0>)
tensor(0.0025, grad_fn=<MeanBackward0>) tensor(0.2076, grad_fn=<StdBackward0>)
tensor(0.0042, grad_fn=<MeanBackward0>) tensor(0.2256, grad_fn=<StdBackward0>)
tensor(0.0050, grad_fn=<MeanBackward0>) tensor(0.2431, grad_fn=<StdBackward0>)
tensor(0.0014, grad_fn=<MeanBackward0>) tensor(0.2584, grad_fn=<StdBackward0>)
tensor(0.0015, grad_fn=<MeanBackward0>) tensor(0.2763, grad_fn=<StdBackward0>)
tensor(-0.0010, grad_fn=<MeanBackward0>) tensor(0.2919, grad_fn=<StdBackward0>)
tensor(0.0005, grad_fn=<MeanBackward0>) tensor

# Andrej's Code

In [33]:
# just for reference - Do not run.

import sys
# sys.exit(0)

class CausalSelfAttention(nn.Module):

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        # output projection
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.c_proj.NANOGPT_SCALE_INIT = 1
        # regularization
        self.n_head = config.n_head
        self.n_embd = config.n_embd

    def forward(self, x):
        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        # nh is "number of heads", hs is "head size", and C (number of channels) = nh * hs
        # e.g. in GPT-2 (124M), n_head=12, hs=64, so nh*hs=C=768 channels in the Transformer
        qkv = self.c_attn(x)
        q, k, v = qkv.split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        y = F.scaled_dot_product_attention(q, k, v, is_causal=True) # flash attention
        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
        # output projection
        y = self.c_proj(y)
        return y

class MLP(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.c_fc    = nn.Linear(config.n_embd, 4 * config.n_embd)
        self.gelu    = nn.GELU(approximate='tanh')
        self.c_proj  = nn.Linear(4 * config.n_embd, config.n_embd)
        self.c_proj.NANOGPT_SCALE_INIT = 1

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        return x

class Block(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x

@dataclass
class GPTConfig:
    block_size: int = 1024 # max sequence length
    vocab_size: int = 50257 # number of tokens: 50,000 BPE merges + 256 bytes tokens + 1 <|endoftext|> token
    n_layer: int = 12 # number of layers
    n_head: int = 12 # number of heads
    n_embd: int = 768 # embedding dimension

class GPT(nn.Module):

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # weight sharing scheme
        self.transformer.wte.weight = self.lm_head.weight

        # init params
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            std = 0.02
            if hasattr(module, 'NANOGPT_SCALE_INIT'):
                std *= (2 * self.config.n_layer) ** -0.5
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        # idx is of shape (B, T)
        B, T = idx.size()
        assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
        # forward the token and posisition embeddings
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device) # shape (T)
        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (T, n_embd)
        tok_emb = self.transformer.wte(idx) # token embeddings of shape (B, T, n_embd)
        x = tok_emb + pos_emb
        # forward the blocks of the transformer
        for block in self.transformer.h:
            x = block(x)
        # forward the final layernorm and the classifier
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x) # (B, T, vocab_size)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss


In [34]:
config = GPTConfig()
model = GPT(config)
model = model.to(device)
# dl = DataLoaderLite(B = 4)

In [35]:
x, y = dl.next_batch()
idx = x
positions = torch.arange(0, idx.shape[-1], step = 1).to(idx.device)
pos_embeddings = model.transformer.wpe(positions) # B, T
tok_embeddings = model.transformer.wte(idx) # B, T, C

x = pos_embeddings[None, :, :] + tok_embeddings
print(x.mean(), x.std())
for block in model.transformer.h:
    x = block(x)
    print(x.mean(), x.std())

x = model.transformer.ln_f(x)
print(x.mean(), x.std())
logits = model.lm_head(x)
print(x.mean(), x.std())

tensor(3.7030e-05, grad_fn=<MeanBackward0>) tensor(0.0283, grad_fn=<StdBackward0>)
tensor(0.0010, grad_fn=<MeanBackward0>) tensor(0.0800, grad_fn=<StdBackward0>)
tensor(0.0003, grad_fn=<MeanBackward0>) tensor(0.1148, grad_fn=<StdBackward0>)
tensor(-0.0019, grad_fn=<MeanBackward0>) tensor(0.1408, grad_fn=<StdBackward0>)
tensor(-0.0028, grad_fn=<MeanBackward0>) tensor(0.1654, grad_fn=<StdBackward0>)
tensor(-0.0051, grad_fn=<MeanBackward0>) tensor(0.1873, grad_fn=<StdBackward0>)
tensor(-0.0071, grad_fn=<MeanBackward0>) tensor(0.2055, grad_fn=<StdBackward0>)
tensor(-0.0104, grad_fn=<MeanBackward0>) tensor(0.2224, grad_fn=<StdBackward0>)
tensor(-0.0106, grad_fn=<MeanBackward0>) tensor(0.2385, grad_fn=<StdBackward0>)
tensor(-0.0122, grad_fn=<MeanBackward0>) tensor(0.2589, grad_fn=<StdBackward0>)
tensor(-0.0141, grad_fn=<MeanBackward0>) tensor(0.2735, grad_fn=<StdBackward0>)
tensor(-0.0122, grad_fn=<MeanBackward0>) tensor(0.2882, grad_fn=<StdBackward0>)
tensor(-0.0116, grad_fn=<MeanBackward0>

# Play Ground

In [6]:
if device == 'cuda':
    torch.set_float32_matmul_precision('high')

In [7]:
config = GPTConfig(vocab_size = 50304)
model = GPT(config)
model = model.to(device)

In [8]:
# Sample run on a dataset
# simple training loop for just one batch
iterations = 50
tokens_per_grad_update = 2**19

B = 16; T = 1024
assert tokens_per_grad_update % (B*T) == 0
grad_accumulation_steps = int(tokens_per_grad_update/(B*T))
print(f"Gradiant Accumulation Steps: {grad_accumulation_steps}")

# opt = torch.optim.AdamW(model.parameters(), lr = 6e-04, betas = (0.9, 0.95), eps = 1e-08)

if device == 'cuda': model = torch.compile(model)

Gradiant Accumulation Steps: 32


In [11]:

opt = model.configure_optmizers(lr = 6e-04, wd = 1e-01, betas = (0.9, 0.95), eps = 1e-08, device_type = device)
lr_scheduler = torch.optim.lr_scheduler.OneCycleLR(opt, max_lr = 6e-04, total_steps = iterations, final_div_factor=10.0)
dl = DataLoaderLite(B = B, T = T)

for i in range(iterations):
    opt.zero_grad()
    accumulated_loss = 0.

    t1 = time.time()
    for j in range(grad_accumulation_steps):
        x, y = dl.next_batch()
        x, y = x.to(device), y.to(device)

        if device == 'cuda':
            with torch.autocast(device_type = device, dtype = torch.bfloat16):
                logits, loss = model(x, y)
        else:
            logits, loss = model(x, y)

        loss /= grad_accumulation_steps
        accumulated_loss += loss.item()
        loss.backward()

    if device == 'cuda': torch.cuda.synchronize()
    t2 = time.time()
    elapsed_time = t2 - t1
    tps = B*T*grad_accumulation_steps/elapsed_time

    norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    opt.step()
    lr_scheduler.step()
    print(f'Iteration: {i} | Loss: {accumulated_loss:.5f} | norm: {norm:.5f} | time: {(elapsed_time/grad_accumulation_steps):.4f} | tps: {tps:.3f}')

Iteration: 0 | Loss: 6.05822 | norm: 0.50192 | time: 0.4632 | tps: 35374.812
Iteration: 1 | Loss: 6.04270 | norm: 0.60625 | time: 0.4650 | tps: 35231.252
Iteration: 2 | Loss: 6.04704 | norm: 0.57237 | time: 0.4582 | tps: 35758.706
Iteration: 3 | Loss: 6.02490 | norm: 0.90870 | time: 0.4522 | tps: 36233.069
Iteration: 4 | Loss: 6.15856 | norm: 2.79147 | time: 0.4486 | tps: 36522.788
Iteration: 5 | Loss: 6.01018 | norm: 1.31635 | time: 0.4486 | tps: 36522.172
Iteration: 6 | Loss: 6.04222 | norm: 2.59953 | time: 0.4528 | tps: 36181.156
Iteration: 7 | Loss: 6.14822 | norm: 2.43822 | time: 0.4555 | tps: 35967.000
Iteration: 8 | Loss: 6.03713 | norm: 1.77703 | time: 0.4558 | tps: 35947.154
Iteration: 9 | Loss: 5.97217 | norm: 1.76285 | time: 0.4558 | tps: 35948.685
Iteration: 10 | Loss: 5.95799 | norm: 1.78912 | time: 0.4538 | tps: 36107.264
Iteration: 11 | Loss: 5.90646 | norm: 0.90164 | time: 0.4529 | tps: 36172.805
Iteration: 12 | Loss: 5.90789 | norm: 0.81700 | time: 0.4527 | tps: 36192.

In [10]:
# Without torch.compile
# Sample run on a dataset
# simple training loop for just one batch
config = GPTConfig(vocab_size = 50304)
model = GPT(config)
model = model.to(device)

iterations = 50
tokens_per_grad_update = 2**19

B = 16; T = 1024
assert tokens_per_grad_update % (B*T) == 0
grad_accumulation_steps = int(tokens_per_grad_update/(B*T))
print(f"Gradiant Accumulation Steps: {grad_accumulation_steps}")

# opt = torch.optim.AdamW(model.parameters(), lr = 6e-04, betas = (0.9, 0.95), eps = 1e-08)

# if device == 'cuda': model = torch.compile(model)

opt = model.configure_optmizers(lr = 6e-04, wd = 1e-01, betas = (0.9, 0.95), eps = 1e-08, device_type = device)
lr_scheduler = torch.optim.lr_scheduler.OneCycleLR(opt, max_lr = 6e-04, total_steps = iterations, final_div_factor=10.0)
dl = DataLoaderLite(B = B, T = T)

for i in range(iterations):
    opt.zero_grad()
    accumulated_loss = 0.

    t1 = time.time()
    for j in range(grad_accumulation_steps):
        x, y = dl.next_batch()
        x, y = x.to(device), y.to(device)

        if device == 'cuda':
            with torch.autocast(device_type = device, dtype = torch.bfloat16):
                logits, loss = model(x, y)
        else:
            logits, loss = model(x, y)

        loss /= grad_accumulation_steps
        accumulated_loss += loss.item()
        loss.backward()

    if device == 'cuda': torch.cuda.synchronize()
    t2 = time.time()
    elapsed_time = t2 - t1
    tps = B*T*grad_accumulation_steps/elapsed_time

    norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    opt.step()
    lr_scheduler.step()
    print(f'Iteration: {i} | Loss: {accumulated_loss:.5f} | norm: {norm:.5f} | time: {(elapsed_time/grad_accumulation_steps):.4f} | tps: {tps:.3f}')

Gradiant Accumulation Steps: 32
Iteration: 0 | Loss: 10.95100 | norm: 25.79529 | time: 0.5580 | tps: 29360.654
Iteration: 1 | Loss: 9.78726 | norm: 11.57891 | time: 0.5275 | tps: 31059.412
Iteration: 2 | Loss: 9.34380 | norm: 7.52412 | time: 0.5224 | tps: 31361.968
Iteration: 3 | Loss: 9.36433 | norm: 11.63742 | time: 0.5188 | tps: 31578.372
Iteration: 4 | Loss: 8.97734 | norm: 3.80904 | time: 0.5195 | tps: 31538.441
Iteration: 5 | Loss: 9.09808 | norm: 10.48295 | time: 0.5202 | tps: 31497.045
Iteration: 6 | Loss: 8.76523 | norm: 3.29494 | time: 0.5196 | tps: 31533.860
Iteration: 7 | Loss: 8.58056 | norm: 2.82115 | time: 0.5202 | tps: 31495.291
Iteration: 8 | Loss: 8.31629 | norm: 2.22433 | time: 0.5199 | tps: 31510.989
Iteration: 9 | Loss: 8.07400 | norm: 3.08333 | time: 0.5194 | tps: 31541.356
Iteration: 10 | Loss: 7.78656 | norm: 2.92010 | time: 0.5196 | tps: 31534.794
Iteration: 11 | Loss: 7.46839 | norm: 2.42303 | time: 0.5195 | tps: 31539.955
Iteration: 12 | Loss: 7.15307 | norm:

In [19]:
x = x[:, :1000]

In [23]:
torch.save(model, './initial_run.pt')

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [20]:
model.eval()
with torch.no_grad():
    target_len = 5
    for i in range(target_len):
        print(i)
        logits, _ = model(x)
        logits = logits[:, -1, :]
        topk_probs, topk_idxs = torch.topk(F.softmax(logits, dim = -1), k = 50)

        ix = torch.multinomial(topk_probs, 1)
        xcol = torch.gather(topk_idxs, -1, ix)
        x = torch.cat([x, ix], dim = 1)

for i in x:
    decoded = enc.decode(i.tolist())
    print(decoded[-6:])

0


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [21]:
x.shape, x.device

(torch.Size([16, 1000]), device(type='cuda', index=0))

In [9]:
model = None
torch.cuda.empty_cache()