In [15]:
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F 
import math
import time

In [16]:
@dataclass
class GPTConfig:
    """Size hyperparameters consumed by the GPT model and its sub-modules."""

    # number of rows in the position-embedding table (longest supported sequence)
    block_size: int = 1024
    # number of rows in the token-embedding table / outputs of the lm_head
    vocab_size: int = 50304
    # number of stacked transformer blocks
    n_layer: int = 12
    # number of attention heads per block (CausalSelfAttention asserts it divides n_embd)
    n_head: int = 12
    # embedding / residual-stream width
    n_embd: int = 768

In [17]:
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention built on PyTorch's fused SDPA kernel.

    Args:
        config: object exposing ``n_embd``, ``n_head`` and ``block_size``;
            ``n_embd`` must be divisible by ``n_head``.
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # key, query, value projections for all heads, fused into a single matmul
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        # output projection back into the residual stream
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        # GPT._init_weights shrinks the init std of flagged layers by
        # (2 * n_layer) ** -0.5; the factor 2 counts the attention *and* MLP
        # output projections of every block, so this one must be flagged too.
        self.c_proj.NANOGPT_SCALE_INIT = 1
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        # Lower-triangular causal mask. forward() does not read it (masking is
        # done via is_causal=True); it stays registered so the module's
        # state_dict keys are unchanged.
        self.register_buffer(
            "bias",
            torch.tril(torch.ones(config.block_size, config.block_size))
            .view(1, 1, config.block_size, config.block_size),
        )

    def forward(self, x):
        """Apply causal self-attention.

        Args:
            x: tensor of shape (B, T, C) with C == n_embd.

        Returns:
            Tensor of shape (B, T, C).
        """
        B, T, C = x.size()  # batch size, sequence length, embedding dimensionality (n_embd)
        head_size = C // self.n_head

        # one projection produces q, k and v; split them along the channel dim
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        # move the head dim forward so it acts as a batch dim: (B, T, C) -> (B, nh, T, hs)
        q = q.view(B, T, self.n_head, head_size).transpose(1, 2)
        k = k.view(B, T, self.n_head, head_size).transpose(1, 2)
        v = v.view(B, T, self.n_head, head_size).transpose(1, 2)

        # fused attention; is_causal=True restricts each position to itself and the past
        y = F.scaled_dot_product_attention(q, k, v, is_causal=True)

        # re-assemble all head outputs side by side: (B, nh, T, hs) -> (B, T, C)
        y = y.transpose(1, 2).contiguous().view(B, T, C)

        # output projection
        return self.c_proj(y)


In [18]:
class MLP(nn.Module):
    """Feed-forward sub-layer: widen to 4 * n_embd, tanh-approximated GELU, project back."""

    def __init__(self, config):
        super().__init__()
        width = config.n_embd
        hidden = 4 * width
        self.c_fc = nn.Linear(width, hidden)
        self.gelu = nn.GELU(approximate="tanh")
        self.c_proj = nn.Linear(hidden, width)
        # marker attribute that weight-init code detects with hasattr(...)
        setattr(self.c_proj, "NANOGPT_SCALE_INIT", 1)

    def forward(self, x):
        """Return c_proj(gelu(c_fc(x))); the last dimension of x must be n_embd."""
        return self.c_proj(self.gelu(self.c_fc(x)))
        

In [19]:
class Block(nn.Module):
    """One pre-norm transformer layer: attention, then MLP, each added back residually."""

    def __init__(self, config):
        super().__init__()
        width = config.n_embd
        # Creation order (ln_1, attn, ln_2, mlp) fixes both the parameter
        # registration order and the order in which the RNG is consumed.
        self.ln_1 = nn.LayerNorm(width)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(width)
        self.mlp = MLP(config)

    def forward(self, x):
        """Return x after the attention and MLP residual updates (same shape as x)."""
        attn_out = self.attn(self.ln_1(x))
        x = x + attn_out
        mlp_out = self.mlp(self.ln_2(x))
        return x + mlp_out

In [20]:
class GPT(nn.Module):
    """GPT-2 style decoder-only language model.

    Args:
        config: object exposing ``vocab_size``, ``block_size``, ``n_layer``,
            ``n_head`` and ``n_embd``.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),   # token embeddings
            wpe = nn.Embedding(config.block_size, config.n_embd),   # position embeddings
            h   = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        # weight sharing: the token embedding and the output projection use the same tensor
        self.transformer.wte.weight = self.lm_head.weight
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise one sub-module; invoked for every sub-module through ``self.apply``."""
        if isinstance(module, nn.Linear):
            std = 0.02
            # layers flagged with NANOGPT_SCALE_INIT get a smaller std that
            # depends on the number of layers
            if hasattr(module, "NANOGPT_SCALE_INIT"):
                std *= (2 * self.config.n_layer) ** -0.5
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Run the model on a batch of token ids.

        Args:
            idx: integer tensor of token ids with shape (B, T), T <= block_size.
            targets: optional tensor of target token ids; it is flattened and
                compared against the flattened logits with cross-entropy.

        Returns:
            ``(logits, loss)`` where logits has shape (B, T, vocab_size) and
            loss is ``None`` when ``targets`` is not given.

        Raises:
            AssertionError: if T exceeds ``config.block_size``.
        """
        B, T = idx.size()
        # a longer sequence would index past the end of the position-embedding table
        assert T <= self.config.block_size, f"Cannot forward sequence of length {T}, block size is only {self.config.block_size}"
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device)  # shape (T)

        # forward the GPT model itself
        tok_emb = self.transformer.wte(idx)  # token embeddings of shape (B, T, n_embd)
        pos_emb = self.transformer.wpe(pos)  # position embeddings of shape (T, n_embd)
        x = tok_emb + pos_emb                # pos_emb broadcasts over the batch dim
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)

        # logits are computed for every position, not only the last one
        logits = self.lm_head(x)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        return logits, loss

    @classmethod
    def from_pretrained(cls, model_type, override_args=None):
        """Build a model and copy in the HuggingFace GPT-2 checkpoint weights.

        Args:
            model_type: one of 'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'.
            override_args: accepted for call compatibility; currently unused.

        Returns:
            A model instance whose parameters hold the checkpoint values.

        Raises:
            KeyError: if ``model_type`` is not one of the names above.
            AssertionError: if parameter names or shapes do not line up.
        """
        from transformers import GPT2LMHeadModel
        print("loading weights from pretrained gpt: %s" % model_type)

        # n_layer, n_head and n_embd are determined from model_type
        config_args = {
            'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
            'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
            'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
            'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
        }[model_type]
        print("forcing vocab_size=50257, block_size=1024")
        config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
        config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints

        # create a from-scratch initialized model to receive the weights
        config = GPTConfig(**config_args)
        model = cls(config)
        sd = model.state_dict()
        # discard the causal-mask buffer, it is not a parameter
        sd_keys = [k for k in sd.keys() if not k.endswith('.attn.bias')]

        # init a huggingface/transformers model
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()

        # ignore the checkpoint's mask buffers as well
        sd_keys_hf = [
            k for k in sd_hf.keys()
            if not k.endswith('.attn.masked_bias') and not k.endswith('.attn.bias')
        ]
        # the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear;
        # this means that we have to transpose these weights when we import them
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']

        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
        with torch.no_grad():
            for k in sd_keys_hf:
                assert k in sd, f"checkpoint key {k} has no counterpart in the model"
                if any(k.endswith(w) for w in transposed):
                    # Conv1D weights are stored transposed relative to nn.Linear
                    assert sd_hf[k].shape[::-1] == sd[k].shape, f"shape mismatch for {k}"
                    sd[k].copy_(sd_hf[k].t())
                else:
                    # vanilla copy; the explicit check matters because copy_ would
                    # silently broadcast a smaller source tensor
                    assert sd_hf[k].shape == sd[k].shape, f"shape mismatch for {k}"
                    sd[k].copy_(sd_hf[k])

        return model
        
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU so
# the notebook still runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [21]:
import tiktoken

class DataLoaderLite():
    """Sequential (input, target) batch loader over one GPT-2-tokenized text file.

    Args:
        B: number of sequences per batch.
        T: number of tokens per sequence.
        path: text file to tokenize; defaults to "input.txt".

    Raises:
        ValueError: if the file holds fewer than ``B*T + 1`` tokens, i.e. not
            enough to build a single batch together with its shifted targets.
    """

    def __init__(self, B, T, path="input.txt"):
        self.B = B
        self.T = T
        enc = tiktoken.get_encoding("gpt2")
        # explicit encoding so the result does not depend on the platform default
        with open(path, "r", encoding="utf-8") as f:
            text = f.read()
        tokens = enc.encode(text)
        # explicit dtype: an empty token list would otherwise become a float tensor
        self.tokens = torch.tensor(tokens, dtype=torch.long)
        if len(self.tokens) < B * T + 1:
            raise ValueError(
                f"{path} has only {len(self.tokens)} tokens; need at least {B * T + 1} for B={B}, T={T}"
            )
        print(f"loaded {len(self.tokens)} tokens")
        print(f" 1 epoch = {len(self.tokens)//(B*T)} batches")
        self.current_position = 0

    def next_batch(self):
        """Return the next ``(x, y)`` pair, each of shape (B, T).

        ``y`` is ``x`` shifted one token to the right. The read position then
        advances by ``B*T`` and wraps to 0 when the following batch (plus its
        one extra target token) would run past the end of the data.
        """
        B, T = self.B, self.T
        # one extra token so the last input position also has a target
        buf = self.tokens[self.current_position : self.current_position + B * T + 1]
        x = buf[:-1].view(B, T)
        y = buf[1:].view(B, T)
        self.current_position += B * T
        if self.current_position + (B * T + 1) > len(self.tokens):
            self.current_position = 0
        return x, y



In [22]:
# Scratch arithmetic. 338025 equals the token count that DataLoaderLite reports
# for input.txt; what 2640 and 32 stand for is not stated in this notebook.
# TODO(review): confirm the intent or delete this cell.
338025/(2640*32)

4.0012428977272725

In [23]:
# Each batch holds 4 sequences of 1024 tokens.
train_loader = DataLoaderLite(B=4, T=1024)

loaded 338025 tokens
 1 epoch = 82 batches


In [24]:
# "high" lets float32 matmuls use faster reduced-precision kernels (e.g. TF32)
# on hardware that supports them.
torch.set_float32_matmul_precision("high")
# Pass a config *instance*. Handing over the GPTConfig class itself only works
# because every field has a class-level default, and it would make any later
# change to model.config affect the class for every other user.
model = GPT(GPTConfig())

In [25]:

# Move the model's parameters and buffers to `device`. Being the last
# expression of the cell, the returned module is also displayed.
model.to(device)
# torch.compile is left commented out here.
#model = torch.compile(model)


GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50304, 768)
    (wpe): Embedding(1024, 768)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear(in_features=768, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): GELU(approximate='tanh')
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50304, bias=False)
)

In [26]:
# AdamW over every model parameter with a fixed learning rate of 3e-4.
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)



In [27]:
import time

In [28]:
# Run 50 optimisation steps, timing each one.
for i in range(50):
    t0 = time.time()
    x, y = train_loader.next_batch()
    x, y = x.to(device), y.to(device)
    optimizer.zero_grad()
    # only the forward pass (including the loss) runs under bfloat16 autocast
    with torch.autocast(device_type=device, dtype=torch.bfloat16):
        logits, loss = model(x, y)
    loss.backward()
    optimizer.step()
    # CUDA work is queued asynchronously; wait for it so dt covers the whole step
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    t1 = time.time()
    dt = (t1 - t0) * 1000  # milliseconds
    tokens_per_second = (train_loader.B * train_loader.T) / (t1 - t0)
    # report the actual step index (the old message printed the literal "[i]")
    print(f"step {i} ,loss:{loss.item()}, dt : {dt:.2f} ms , tok/sec :{tokens_per_second :.2f}")

step [i] ,loss:10.910934448242188, dt : 502.24 ms , tok/sec :8155.41
step [i] ,loss:9.72317886352539, dt : 532.88 ms , tok/sec :7686.50
step [i] ,loss:9.504365921020508, dt : 440.21 ms , tok/sec :9304.70
step [i] ,loss:8.964580535888672, dt : 438.67 ms , tok/sec :9337.39
step [i] ,loss:8.804107666015625, dt : 439.14 ms , tok/sec :9327.22
step [i] ,loss:8.464309692382812, dt : 440.84 ms , tok/sec :9291.38
step [i] ,loss:8.237056732177734, dt : 441.20 ms , tok/sec :9283.85
step [i] ,loss:7.9466705322265625, dt : 442.74 ms , tok/sec :9251.39
step [i] ,loss:7.808586120605469, dt : 441.15 ms , tok/sec :9284.76
step [i] ,loss:7.554592132568359, dt : 439.95 ms , tok/sec :9310.09
step [i] ,loss:7.5595550537109375, dt : 440.29 ms , tok/sec :9302.86
step [i] ,loss:7.567497253417969, dt : 441.71 ms , tok/sec :9273.07
step [i] ,loss:7.602851867675781, dt : 441.12 ms , tok/sec :9285.37
step [i] ,loss:7.487571716308594, dt : 439.59 ms , tok/sec :9317.84
step [i] ,loss:7.145847320556641, dt : 439.21 

In [29]:
# Show the loss tensor left over from the last training step.
print(loss)

tensor(6.4409, device='cuda:0', grad_fn=<NllLossBackward0>)


In [16]:
# Sampling settings: number of completions to draw, and the total length
# (prompt tokens included) that each sequence is grown to.
num_return_sequences, max_length = 5, 30

In [17]:
import tiktoken
# Encode the prompt with the GPT-2 tokenizer and replicate it once per sample.
enc = tiktoken.get_encoding("gpt2")
tokens = enc.encode("Hello, I'm a language model,")
tokens = torch.tensor(tokens, dtype=torch.long)
print(tokens.shape)
print(num_return_sequences)
# (n_tokens,) -> (num_return_sequences, n_tokens)
tokens = tokens.unsqueeze(0).repeat(num_return_sequences, 1)
# use the notebook-wide `device` (the one the model was moved to) rather than a literal
x = tokens.to(device)

torch.Size([8])
5


In [18]:

# Seed both the CPU and the CUDA generators with 42 so sampling is repeatable.
for _seed_fn in (torch.manual_seed, torch.cuda.manual_seed):
    _seed_fn(42)

In [19]:
import math

In [24]:

# Grow every row of x one token at a time until it holds max_length tokens.
while x.size(1) < max_length:
    # if the running context is longer than the model supports, keep only the
    # most recent block_size tokens
    block_size = model.config.block_size
    idx_cond = x if x.size(1) <= block_size else x[:, -block_size:]
    with torch.no_grad():
        # the model returns (logits, loss); loss is None because no targets are passed
        logits, _ = model(idx_cond)
        # keep only the logits of the final position: (B, vocab_size)
        logits = logits[:, -1, :]
        # convert logits to (normalized) probabilities
        probs = F.softmax(logits, dim=-1)
        # restrict sampling to the 50 most likely tokens
        topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
        # sample a slot within the top-k (multinomial treats the rows as weights)
        ix = torch.multinomial(topk_probs, 1)
        # map the sampled slot back to its vocabulary index
        xcol = torch.gather(topk_indices, -1, ix)
        # append the sampled token to the running sequence and continue
        x = torch.cat((x, xcol), dim=1)
    
       

In [25]:
# Print every sampled sequence, first as raw token ids and then as decoded text.
for i in range(num_return_sequences):
    print(x[i, :max_length])
    # .tolist() yields plain Python ints in one device-to-host transfer;
    # list(tensor) would hand the tokenizer a list of 0-d tensors instead
    tokens = x[i, :max_length].tolist()
    decoded = enc.decode(tokens)
    print(">", decoded)

tensor([15496,    11,   314,  1101,   257,  3303,  2746,    11,   407,   257,
         1430,    13,   198,   198,  2396,   428,  3329,   314,  2067, 11065,
          329,   262,  2720,   287,   262,  2248,    13,   770,   373,   407],
       device='cuda:0')
> Hello, I'm a language model, not a program.

So this morning I started studying for the interview in the lab. This was not
tensor([15496,    11,   314,  1101,   257,  3303,  2746,    11,   290,   530,
          286,   262,  1388,  1243,   326, 46293,   502,   618,   484,  2251,
         8950,   318,   703,  2562,   340,  4329,   284,  2251,  1223,   326],
       device='cuda:0')
> Hello, I'm a language model, and one of the main things that bothers me when they create languages is how easy it becomes to create something that
tensor([15496,    11,   314,  1101,   257,  3303,  2746,    11,   290,   314,
         2630,   340,   572,   319,   262,  9384,   326,   257,  3303,  2746,
          561,   787,   502,   517, 43472,    13,   