
## GPT from scratch in PyTorch


In [7]:

import torch
import numpy as np
import torch.nn as nn

from torch.nn import functional as F


In [8]:

torch.manual_seed(256)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

block_size        = 40      ## N tokens in sequence
batch_size        = 64 
max_iters         = 6000
eval_interval     = 500     
learning_rate     = 0.0003
eval_iters        = 300
vocab_size        = 88  ## 65

## every id for a given token is embedded to vector of this size
n_embd            = 512                  
n_head            = 8         ## 8 attention heads
n_layer           = 6         ## 6 eoncoder layers
dropout           = 0.2


In [9]:

text = ''

input_file2 = 'HuckFinn.txt'

with open(input_file2, 'r', encoding='utf-8') as f:
    text = f.read()


In [10]:

print("length of data in letter or characters")
len(text)




length of data in letter or characters


590407

In [11]:

list(set(text))


['l',
 'K',
 'p',
 '5',
 'Q',
 'F',
 'V',
 'A',
 '_',
 'E',
 'k',
 '—',
 'c',
 'D',
 'R',
 '1',
 'z',
 '”',
 'T',
 't',
 's',
 '4',
 'H',
 'L',
 '\ufeff',
 '8',
 '?',
 'v',
 '3',
 'h',
 '2',
 'W',
 'x',
 '*',
 '(',
 '“',
 'y',
 'm',
 '#',
 '•',
 'r',
 'q',
 '$',
 '7',
 ':',
 '9',
 'J',
 ')',
 ']',
 'b',
 '™',
 '\n',
 'j',
 '%',
 'n',
 'g',
 '0',
 '/',
 '‘',
 '’',
 ';',
 'X',
 'u',
 'S',
 ' ',
 'w',
 'G',
 'C',
 '-',
 'O',
 '6',
 ',',
 '[',
 '.',
 'i',
 'a',
 '!',
 'e',
 'M',
 'f',
 'I',
 'Y',
 'o',
 'N',
 'd',
 'B',
 'U',
 'P']

In [12]:

the_chars  = sorted(     list(set(text))     )

vocab_size = len( the_chars )      ## 65

print(  len(the_chars)  )

print(  ''.join(the_chars)  )

## The printed oputput
## !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz



88

 !#$%()*,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXY[]_abcdefghijklmnopqrstuvwxyz—‘’“”•™﻿


In [13]:

stoi = { ch:i for i, ch in enumerate(the_chars) }
itos = { i:ch for i, ch in enumerate(the_chars) }


In [14]:

print( stoi )
print( itos )


{'\n': 0, ' ': 1, '!': 2, '#': 3, '$': 4, '%': 5, '(': 6, ')': 7, '*': 8, ',': 9, '-': 10, '.': 11, '/': 12, '0': 13, '1': 14, '2': 15, '3': 16, '4': 17, '5': 18, '6': 19, '7': 20, '8': 21, '9': 22, ':': 23, ';': 24, '?': 25, 'A': 26, 'B': 27, 'C': 28, 'D': 29, 'E': 30, 'F': 31, 'G': 32, 'H': 33, 'I': 34, 'J': 35, 'K': 36, 'L': 37, 'M': 38, 'N': 39, 'O': 40, 'P': 41, 'Q': 42, 'R': 43, 'S': 44, 'T': 45, 'U': 46, 'V': 47, 'W': 48, 'X': 49, 'Y': 50, '[': 51, ']': 52, '_': 53, 'a': 54, 'b': 55, 'c': 56, 'd': 57, 'e': 58, 'f': 59, 'g': 60, 'h': 61, 'i': 62, 'j': 63, 'k': 64, 'l': 65, 'm': 66, 'n': 67, 'o': 68, 'p': 69, 'q': 70, 'r': 71, 's': 72, 't': 73, 'u': 74, 'v': 75, 'w': 76, 'x': 77, 'y': 78, 'z': 79, '—': 80, '‘': 81, '’': 82, '“': 83, '”': 84, '•': 85, '™': 86, '\ufeff': 87}
{0: '\n', 1: ' ', 2: '!', 3: '#', 4: '$', 5: '%', 6: '(', 7: ')', 8: '*', 9: ',', 10: '-', 11: '.', 12: '/', 13: '0', 14: '1', 15: '2', 16: '3', 17: '4', 18: '5', 19: '6', 20: '7', 21: '8', 22: '9', 23: ':', 24:

In [16]:

encode = lambda s: [ stoi[c]          for c in s   ] 

encode("bahh")


[55, 54, 61, 61]

In [17]:

decode = lambda l: ''.join(   itos[i] for i in l   )    

decode([55, 54, 61, 61])



'bahh'

In [18]:

data = torch.tensor(   encode(text), dtype=torch.long   )

print( data )


tensor([87, 45, 61,  ...,  0,  0,  0])


In [19]:

n          = int(   0.9*len(data)   )

train_data = data[:n]
val_data   = data[n:]


In [20]:

def get_batch(split):
    if split == "train":
        data = train_data
    else:
        data = val_data
        
    ix = torch.randint(   len(data) - block_size, (batch_size,)   )
    
    x  = torch.stack(    [  data[   i : i+block_size ]     for i in ix ]    ) 
    y  = torch.stack(    [  data[ i+1 : i+1+block_size ]   for i in ix ]    )
    
    x, y = x.to(device), y.to(device)

    return x, y


In [21]:

temp_batch_size = 4
temp_block_size = 16

## select random starting points for the 4 sentences
ix = torch.randint(   
            len(data) - block_size, 
            (temp_batch_size,)   
)

print( ix )


tensor([ 47693, 180144,  44367,  30762])


In [22]:

for index_temp in ix:
    print(  data[index_temp]  )



tensor(1)
tensor(59)
tensor(1)
tensor(1)


In [23]:

x  = torch.stack(    
    [ data[   i : i+  temp_block_size ]   for i in ix ] 
    
) 

y  = torch.stack(    
    [ data[ i+1 : i+1+ temp_block_size ]  for i in ix ]    
)

print(x)
print(y)



tensor([[ 1, 45, 61, 54, 73, 56, 61, 58, 71, 24,  1, 61, 58, 82, 65, 65],
        [59, 54, 71, 66,  1, 56, 65, 68, 72, 58,  1, 73, 68,  0, 76, 61],
        [ 1, 54,  0, 59, 62, 72, 61, 10, 55, 58, 65, 65, 78,  1, 76, 61],
        [ 1, 73, 76, 58, 65, 75, 58,  1, 66, 62, 65, 58,  1, 54, 55, 68]])
tensor([[45, 61, 54, 73, 56, 61, 58, 71, 24,  1, 61, 58, 82, 65, 65,  1],
        [54, 71, 66,  1, 56, 65, 68, 72, 58,  1, 73, 68,  0, 76, 61, 58],
        [54,  0, 59, 62, 72, 61, 10, 55, 58, 65, 65, 78,  1, 76, 61, 62],
        [73, 76, 58, 65, 75, 58,  1, 66, 62, 65, 58,  1, 54, 55, 68, 75]])


In [24]:

@torch.no_grad()    ## for efficient processing
def estimate_loss():
    out = {}
    model.eval()   ## set to no training
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()  ## back to training
    return out




## NN Architectures


In [25]:

class Head(nn.Module):

    def __init__(self, head_size):
        super().__init__()
        
        self.key   = nn.Linear(n_embd, head_size, bias=False)  ## [512, 64]
        self.query = nn.Linear(n_embd, head_size, bias=False)  ## [512, 64]
        self.value = nn.Linear(n_embd, head_size, bias=False)  ## [512, 64]

        tril_def = torch.tril( torch.ones(block_size, block_size) )  ## [40, 40]
        
        self.register_buffer(
                  'tril', 
                  tril_def
               )
        
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        
        B, T, E = x.shape   ## [batch_size, 40, 512]
        
        k = self.key(   x )            ## k = (B, T, 64)
        q = self.query( x )            ## q = (B, T, 64)

        E2 = 64     ## I think this is 64 and not 512
        ## (B, T, E) @ (B, E, T)  -> (B, T, T)
        wei = q @ k.transpose(-2, -1) * E2 ** -0.5        
        
        wei = wei.masked_fill(
                      self.tril[:T, :T] == 0, 
                      float('-inf')
        )   
        
        ## (B, T, T)
        wei = F.softmax( wei, dim= -1 )         ## (B, T, T)
        wei = self.dropout(   wei   )
        
        ## perform weighted aggregation of values
        
        v   = self.value(  x  )   ## x = (B, 40, E)
        out = wei @ v             ## (B, T, T) @ (B, T, 64) -> (B, T, 64)
        
        return out
        


In [26]:


class FeedForward(nn.Module):

    def __init__(self, n_embd):         ## 512
        
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),      ## [512, 4*512]
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),      ## [4*512, 512]
            nn.Dropout(dropout),
        )
        
    def forward(self, x):
        return self.net(x)


In [27]:

class MultiHeadAttention(nn.Module):

    def __init__(self, num_heads, head_size):    ## (8, 64)
        super().__init__()
        self.heads = nn.ModuleList(  [ Head(head_size) for _ in range(num_heads) ] )
        self.proj  = nn.Linear(n_embd, n_embd)   ## 512, 512
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        out = torch.cat(   [ h(x) for h in self.heads ], dim = -1   )
        out = self.proj(  out   )
        out = self.dropout(   out   )
        return out



In [28]:

class Block(nn.Module):
    
    def __init__(self, n_embd, n_head):     ## (512, 8)
        super().__init__()
        head_size = n_embd // n_head        ## 64
        self.sa   = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward( n_embd)    ## 512
        self.ln1  = nn.LayerNorm(n_embd)
        self.ln2  = nn.LayerNorm(n_embd)
        
    def forward(self, x):
        x = x + self.sa(     self.ln1(x)      )
        x = x + self.ffwd(   self.ln2(x)      )
        return x


In [29]:

class GPTModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)   ## [65, 512]
        self.pos_emb_table = nn.Embedding(block_size, n_embd)     ## [block, 512]
        
        self.blocks = nn.Sequential(
                *[   Block(n_embd, n_head=n_head) for _ in range(n_layer)    ]
        )
        
        self.ln_f    = nn.LayerNorm(  n_embd    )        
        self.lm_ffw_head = nn.Linear(n_embd, vocab_size)  ## [512, 65] # FFW Layer
        
    def forward(self, idx, targets=None):
        B, T = idx.shape     ## (Batch, 40)
        ## ids and targets are both (B, T) tensors of integers
        
        tok_emb = self.token_embedding_table(idx)      
        pos_emb = self.pos_emb_table(torch.arange(T, device=device))  
        
        x = tok_emb + pos_emb    ## [B, T, E] or [64, 40, 512]

        ## This is the architecture
        x = self.blocks(  x  )   ## (B, T, E)        
        x = self.ln_f(    x  )   ## (B, T, E)   ## norm
        logits = self.lm_ffw_head(x)         ## [B, 40, 65] 
        
        if targets is None:
            loss = None
        else:
            B, T, E  = logits.shape
            logits  = logits.view( B*T, E)
            targets = targets.view(B*T)
            loss    = F.cross_entropy(logits, targets)
        return logits, loss
        
    def generate(self, idx, max_new_tokens):    ## idx is (B, T)
        for _ in range(max_new_tokens):
            ## crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            logits, loss = self(idx_cond)    ## ## get preds
            logits = logits[:, -1, :]    ## focus on last one (B, E)
            probs = F.softmax(logits, dim= -1)    ## (B, E) get probs
            idx_next = torch.multinomial(probs, num_samples=1)     ## (B, 1) selected
            idx = torch.cat(  (idx, idx_next), dim=1  )   ## (B, T+1) append sample to running sequence
        return idx
            


In [30]:

model   = GPTModel()

m       = model.to(device)

optimizer = torch.optim.Adam(  m.parameters(), lr=learning_rate   )



In [31]:


for iter in range(max_iters):
    
    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    xb, yb = get_batch('train')
    
    ## eval the loss
    logits, loss = m(xb, yb)
    
    optimizer.zero_grad(set_to_none=True)   ## zero out
    loss.backward()
    optimizer.step()


step 0: train loss 4.6873, val loss 4.6941
step 500: train loss 1.6325, val loss 1.9573
step 1000: train loss 1.4677, val loss 1.8467
step 1500: train loss 1.3884, val loss 1.7996
step 2000: train loss 1.3267, val loss 1.7387
step 2500: train loss 1.2829, val loss 1.7265
step 3000: train loss 1.2525, val loss 1.7277
step 3500: train loss 1.2142, val loss 1.7051
step 4000: train loss 1.1874, val loss 1.7138
step 4500: train loss 1.1594, val loss 1.7078
step 5000: train loss 1.1324, val loss 1.7187
step 5500: train loss 1.1071, val loss 1.7381


In [51]:


## Starting token  id_sos = 0
sos_context = torch.zeros(  (1, 1),  dtype=torch.long, device=device   )   

generated_text = m.generate(sos_context, max_new_tokens=500)[0].tolist()

print(  decode(generated_text)   )




seemed that man come out Silas he warn’t one more
her money by ching, just the shed and come and
mad” the young was smarking; and it warn’t no lember ’bout
bein nigger.”

And then he said times to call me about whether I was on
them farms around. They knowed it was the more with a cow, and blamed off and kept stole, and
two mile below a treathen two-mile below and allowed on a whole
whirteen dollar en dem
genies you going?”

“Oh, I won’t.”

“Well, I tell you _you_ in the bag rip for about?”

So 


In [52]:

sos_context = torch.ones(  (1, 1),  dtype=torch.long, device=device   )   

generated_text = m.generate(sos_context, max_new_tokens=500)[0].tolist()

print(  decode(generated_text)   )


 conside; and all count of thing their Man
handker in it. And nahead of my bedug and glad him; hained and made me
fell around she was full of thing that we
can; but when the ridge of the boot, and make him better. Well, the brush end down and struck out in a _cowsuns_, too; never got
tongueal day just all the tree or heaps of papictle.

I went for it; and they see him. I never see anything cry, and got him all madiful; and Jim said this
pap, they wasn’t curied no ways him. Peter we’ve a rob’long 


In [53]:

new_lst = encode("Where is Huck?")


In [54]:

new_np = np.array(  new_lst   )
new_np


array([48, 61, 58, 71, 58,  1, 62, 72,  1, 33, 74, 56, 64, 25])

In [55]:

new_context = torch.tensor(new_np, dtype=torch.long, device=device )


new_context = new_context.view( (1, -1))
new_context 


tensor([[48, 61, 58, 71, 58,  1, 62, 72,  1, 33, 74, 56, 64, 25]],
       device='cuda:0')

In [59]:

generated_text = m.generate(new_context, max_new_tokens=500)[0].tolist()

print(  decode(generated_text)   )


Where is Huck? Why, _dot_ you know?” so I know—that didn’t want to kind of
it, for anything along, then I reckoned I
went inquire; and it warn’t no all
right out there.”

“Well, you ain’t generly.”

And at last.”

Marry sand she stay after lip in the idea,
Duke Buck LEBEKROD
THE MXL
HE THE ALVINN

THE HOKESTAHEPTER IV.

Thinks Aunt Sally streaking over the island,
and says:

“If I has pote ’bout two worse time—nife does a collect’ that way; but I didn’t wished to mind I used to their head down room begancinte



## Figuring out dimensions


In [60]:

new_context.shape


torch.Size([1, 14])

In [61]:

sos_context_tmp = torch.ones(  (1, 1),  dtype=torch.long, device=device   ) 
sos_context_tmp.shape


torch.Size([1, 1])