# Building a GPT from scratch - pre-training, fine-tuning, RAGs

# Source file is available at https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [172]:
import torch
import numpy as np
import time

In [129]:
with open('input.txt', 'r', encoding='utf-8') as f:
    shakespeare_text = f.read()
print(f'Number of characters = {len(text)} \n ************************ \n First 100 characters are: \n ************************ \n {text[:100]}')

Number of characters = 1115394 
 ************************ 
 First 100 characters are: 
 ************************ 
 First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


# We will build a simple transformer on character level vocabulary. We will do the following:
1) build a small vocabulary,
2) test an example,
3) encode our text,
4) split it into train and test

In [130]:
vocab = sorted(list(set(text)))
vocab_size = len(vocab)
print(''.join(vocab), "\nvocab size = ", vocab_size)
encode_mapping = {ch:i for i, ch in enumerate(vocab)}
decode_mapping = {i:ch for i, ch in enumerate(vocab)}


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz 
vocab size =  65


In [156]:
def encode(text):
    encode_lambda = lambda s: [encode_mapping[ch] for ch in s]
    return torch.tensor(encode_lambda(text))

def decode(coded_sentence_tensor):
    decode_lambda = lambda s: ''.join([decode_mapping[ch] for ch in s])
    return decode_lambda(coded_sentence_tensor.numpy())

In [158]:
example = encode('hii there')
example

tensor([46, 47, 47,  1, 58, 46, 43, 56, 43])

In [159]:
decode(example)

'hii there'

In [134]:
### Code the shakespeare text ###

data = encode(shakespeare_text)
print(f"len(data) = {len(data)}")
print(data[:100], shakespeare_text[:100])

len(data) = 1115394
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59]) First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [135]:
### Split data into train and test ###

train = data[:int(0.9*len(data))]
test = data[int(0.9*len(data)):]
print(f"train size = {len(train)}, and test size = {len(test)}")

train size = 1003854, and test size = 111540


# For a transformer, we need to split the data into batches, where each back contains a block of numbers. 
## Block size = 8. 
## batch size = number of blocks that will be processed in parallel

Each block of 8 consists of 8 training examples for the transformer. Let's see how with an example.

In [136]:
block_size = 8
batch_size = 4

In [137]:
x = data[:block_size]
y = data[1:block_size+1]

for index in range(block_size):
    print(f"when training on {x[:index+1]}, target is {y[index]}")

when training on tensor([18]), target is 47
when training on tensor([18, 47]), target is 56
when training on tensor([18, 47, 56]), target is 57
when training on tensor([18, 47, 56, 57]), target is 58
when training on tensor([18, 47, 56, 57, 58]), target is 1
when training on tensor([18, 47, 56, 57, 58,  1]), target is 15
when training on tensor([18, 47, 56, 57, 58,  1, 15]), target is 47
when training on tensor([18, 47, 56, 57, 58,  1, 15, 47]), target is 58


In [138]:
torch.manual_seed(1337)
def get_batch(split):
    data = train if split == "train" else test
    random_indexes = torch.randint(len(data)-block_size, (batch_size, ))
    x = torch.stack([data[i:i+block_size] for i in random_indexes])
    y = torch.stack([data[i+1:i+block_size+1] for i in random_indexes])
    return x, y

xb, yb = get_batch('train')
print("inputs = ", xb)
print("outputs = ", yb)
print(f"Input shape = {xb.shape}")
print(f"Output shape = {yb.shape}")

inputs =  tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
outputs =  tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
Input shape = torch.Size([4, 8])
Output shape = torch.Size([4, 8])


In [139]:
for i in range(batch_size):
    for j in range(block_size):
        context = xb[i, :j+1]
        target = yb[i, j]
        print(f"When input = {context.tolist()}, target = {target}")

When input = [24], target = 43
When input = [24, 43], target = 58
When input = [24, 43, 58], target = 5
When input = [24, 43, 58, 5], target = 57
When input = [24, 43, 58, 5, 57], target = 1
When input = [24, 43, 58, 5, 57, 1], target = 46
When input = [24, 43, 58, 5, 57, 1, 46], target = 43
When input = [24, 43, 58, 5, 57, 1, 46, 43], target = 39
When input = [44], target = 53
When input = [44, 53], target = 56
When input = [44, 53, 56], target = 1
When input = [44, 53, 56, 1], target = 58
When input = [44, 53, 56, 1, 58], target = 46
When input = [44, 53, 56, 1, 58, 46], target = 39
When input = [44, 53, 56, 1, 58, 46, 39], target = 58
When input = [44, 53, 56, 1, 58, 46, 39, 58], target = 1
When input = [52], target = 58
When input = [52, 58], target = 1
When input = [52, 58, 1], target = 58
When input = [52, 58, 1, 58], target = 46
When input = [52, 58, 1, 58, 46], target = 39
When input = [52, 58, 1, 58, 46, 39], target = 58
When input = [52, 58, 1, 58, 46, 39, 58], target = 1
Whe

# We first get a stupid model where the tokens in a block do not talk to each other. The next character prediction is based out of a look up table, not from attention mechanism

In [140]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

<torch._C.Generator at 0x10bd218b0>

In [141]:
#  Class structure taken from https://pytorch.org/docs/stable/generated/torch.nn.Module.html

# We need to 
# 1) Create a forward pass, i.e., define an embedding table, a single node network that simply looks up the embedding of the previous token, gets the value, creates the logit, and gets loss function due to logit (since we know the actual output). 
# 2) Create a decoder pass that generates the next output using the forward pass
# Note that we don't need to create the logit and the loss function. We do this so that when we improve the model, we can compute the loss and see it drop.

class BigramLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, target=None):
        # idx is the current context
        # target is the next predicted character
        # Due to parallel processing, idx and targets are both (batch_size, block_size) tensor of integers
        B, T = idx.shape
        logits = self.token_embedding_table(idx) # logits is of size (batch_size, block_size, vocab_size) = (B, T, C)
        
        if target == None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C) # 32 x 65
            target = target.view(B*T)
            loss = F.cross_entropy(logits, target) 
            # Pytorch will automatically change target to (B*T, 1) and replicate it C times across that dimension to compute loss
        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is the current context
        # max_new_tokens is the max number of characters to generate
        for _ in range(max_new_tokens):
            logits, loss = self.forward(idx)
            relevant_logits = logits[:, -1, :] # we get the last time step in the sequence: becomes (B, 1, C).
            probs = F.softmax(relevant_logits, dim = -1) # Becomes (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples = 1)
            idx = torch.cat((idx, idx_next), dim = 1)
        return idx

In [142]:
yb

tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])

In [143]:
m = BigramLanguageModel()
idx = torch.zeros((1, 1), dtype=torch.long)
print(idx.shape, xb.shape, yb.shape)
# logits, loss = m.forward(idx)
logits, loss = m.forward(xb, yb)
print(logits.shape, loss)

torch.Size([1, 1]) torch.Size([4, 8]) torch.Size([4, 8])
torch.Size([32, 65]) tensor(4.8786, grad_fn=<NllLossBackward0>)


In [160]:
print(decode(m.generate(idx = torch.zeros((1,1), dtype=torch.long), max_new_tokens=100)[0]))


CCA't-L!BotNX CJ?.yZBCbYiKH;P YkRocBUMykIfFGRetY!uHN.cp$kzo,I&fiMD-rbjJmho,Rpw:vZEQvjKHAUzoTOmw$I.KX


### Let's optimize now

In [161]:
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)
batch_size = 32
for steps in range(100): # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=500)[0]))

4.772301197052002

DpJG-gYQThXtN3XNAc;KMbObHRlylTyuF-d,';LGjF&sr'Rjj?jK$?Y!$dD.i
SaQJpOlytJ;'thtriVc'AeDnYUXeawa!uvqHWcsV.uDzHQH:CVJxrhsgUv&y
GbOJQEXotow:'uM qfT'aI-apDqIVlHnUQYsBJ$wLgnqfI!qKoi f'sBN:WKjKBylfRqPN&QTVqr&y'rLftimb
XXgCCnpM:CjRvvV&bC-FtoisyuQ&fSVwUApxEsTVrcMAEZgXXlxitoCU'PwhrIfkId?OEo.RdKo?nQisrdnuGNpx$pZGRr&yyh$-IPy'Ih -Yu$G.pJMOE!q?Z3evX-KAk.IBT!mVjnm
qMDMphYya!UNXAT:CH-bk$Mp$DV,,e:COs&vEsV3m,uybPd?pUGRlx
pxyTsStepkgf?IiHTbR
V?uj?Y3i.aUiGGviClI!U;A-QQW:sk!lj$ Plg
wIFERw fDAAJO'h;
,phsVHxBo.
3n;pJiG


# The above result is quite garbage. Now, let's add attention block so the tokens can communicate with each other

# Create one head of self attention
We will make one change. Instead of each batch, block of size vocab_size, we are going to use an embedding of size n_embed on top of the simple embedding of vocab_size.

In [179]:
n_layer = 4 # Number of transformer blocks to put in sequence
n_head = 4 # number of multi heads in each transformer block

# Parameters for each head are as follows:
batch_size = 16
block_size = 32
n_embed = 64
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'

max_iters = 5000
eval_interval = 500
eval_iters = 200

dropout = 0.0
head_size = n_embed

In [183]:
n_layer = 6 # Number of transformer blocks to put in sequence
n_head = 6 # number of multi heads in each transformer block

# Parameters for each head are as follows:
batch_size = 64
block_size = 256
n_embed = 384 # 64*6 = batch_size * n_head
learning_rate = 3e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'

max_iters = 500
eval_interval = 10
eval_iters = 20

dropout = 0.2
head_size = n_embed

In [184]:
# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train if split == 'train' else test
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'test']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.query = nn.Linear(n_embed, head_size, bias = False)
        self.key = nn.Linear(n_embed, head_size, bias = False)
        self.value = nn.Linear(n_embed, head_size, bias = False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        
        self.dropout = nn.Dropout(dropout)

    def forward(self, idx):
        # idx is the current context
        # target is the next predicted character
        # Due to parallel processing, idx and targets are both (batch_size, block_size) tensor of integers
        B, T, C = idx.shape # B = batch size, T = block size, C = n_embed
        k = self.key(idx) # B, T, head_size
        q = self.query(idx) # B, T, head_size
        v = self.value(idx) # B, T, head_size
        # Compute Attention scores. We need to multiply q with k but we can only do that if q is of shape (B, T, head_size) and k is of chape (B, head_size, T)
        attention_scores = q @ k.transpose(-2, -1) * (C**(-0.5)) # (C**(-0.5)) is the normalization inv(sqrt(d_k)) in the origital attention paper
        # attention_scores is of size (B, T, T)

        # For decoder block, we need to do masked fill of attention scores since we don't want future to affect past. Fill upper triangle with -Inf before softmax
        attention_scores = attention_scores.masked_fill(self.tril[:T,:T] == 0, float('-inf')) # (B, T, T)
        softmax_scores = F.softmax(attention_scores, dim=-1)
        softmax_scores_with_dropout = self.dropout(softmax_scores)
        out = softmax_scores_with_dropout @ v # (B, T, T) @ (B, T, head_size) = (B, T, head_size)
        return out

class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(num_heads*head_size, n_embed)
        self.dropout = nn.Dropout(dropout)

    def forward(self, context):
        out = torch.cat([h(context) for h in self.heads], dim = -1)
        out = self.dropout(self.proj(out))
        return out

class feedforward(nn.Module): # After multi heads, we need a feedforward network
    def __init__(self, n_embed):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(n_embed, 4*n_embed), nn.ReLU(), nn.Linear(4*n_embed, n_embed), nn.Dropout(dropout))

    def forward(self, idx):
        return self.net(idx)

# Now, all building blocks are complete. Let's put a block of transformer together
class TransformerBlock(nn.Module):
    def __init__(self, n_embed, n_head):
        super().__init__()
        head_size = n_embed
        self.multihead = MultiHeadAttention(n_head, head_size)
        self.ffwd = feedforward(n_embed)
        self.ln1 = nn.LayerNorm(n_embed)
        self.ln2 = nn.LayerNorm(n_embed)

    def forward(self, idx):
        idx = self.multihead(self.ln1(idx))
        idx = self.ffwd(self.ln2(idx))
        return idx

# Finally, we need to connect the data set into batches, blocks, vocab_size to the TransformerBlock
class GPTModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.position_embedding_table = nn.Embedding(block_size, n_embed)
        self.TransformerBlocks = nn.Sequential(*[TransformerBlock(n_embed, n_head) for _ in range(n_layer)])
        self.layernorm_final = nn.LayerNorm(n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_size)

        self.apply(self._init_weights) # for better initialization

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, context, target = None):
        B, T = context.shape
        
        tok_emb = self.token_embedding_table(context) # B, T, n_embed
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # T, n_embed
        x = tok_emb + pos_emb # B, T, n_embed
        x = self.TransformerBlocks(x)
        x = self.layernorm_final(x)
        logits = self.lm_head(x) # (B, T, vocab_size)
        if target is None:
                loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            target = target.view(B*T)
            loss = F.cross_entropy(logits, target)
    
        return logits, loss
        

    def generate(self, idx, max_new_tokens):
        # idx is the current context. It is of size (B, T)
        # max_new_tokens is the max number of characters to generate
        for _ in range(max_new_tokens):
            logits, loss = self.forward(idx[:, -block_size:]) # We truncate idx to meet only block_size sized tensor array
            relevant_logits = logits[:, -1, :] # we get the last time step in the sequence: becomes (B, 1, C).
            probs = F.softmax(relevant_logits, dim = -1) # Becomes (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples = 1) # (B, 1)
            idx = torch.cat((idx, idx_next), dim = 1) # (B, T+1)
        return idx

In [173]:
model = GPTModel()
m = model.to(device)
print("device = ", device)
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    start = time.time()

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['test']:.4f}, time taken = {time.time()-start} seconds")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
out = m.generate(context, max_new_tokens=500)

0.406337 M parameters
step 0: train loss 4.1873, val loss 4.1890, time taken = 3.625605821609497 seconds
step 500: train loss 3.3215, val loss 3.3581, time taken = 3.7112720012664795 seconds
step 1000: train loss 3.3155, val loss 3.3528, time taken = 3.477132797241211 seconds
step 1500: train loss 3.3086, val loss 3.3451, time taken = 3.185731887817383 seconds
step 2000: train loss 3.3004, val loss 3.3596, time taken = 3.3659539222717285 seconds
step 2500: train loss 3.3075, val loss 3.3368, time taken = 3.3775699138641357 seconds
step 3000: train loss 3.3119, val loss 3.3412, time taken = 3.198637008666992 seconds
step 3500: train loss 3.3073, val loss 3.3518, time taken = 3.5656850337982178 seconds
step 4000: train loss 3.3065, val loss 3.3503, time taken = 3.0073740482330322 seconds
step 4500: train loss 3.3144, val loss 3.3466, time taken = 3.0669801235198975 seconds
step 4999: train loss 3.3141, val loss 3.3520, time taken = 3.7007861137390137 seconds


In [185]:
# n_layer = 4 # Number of transformer blocks to put in sequence
# n_head = 4 # number of multi heads in each transformer block

# # Parameters for each head are as follows:
# batch_size = 16
# block_size = 32
# n_embed = 64
# learning_rate = 3e-4
# device = 'cuda' if torch.cuda.is_available() else 'cpu'

# max_iters = 100
# max_iters = 5000
# eval_interval = 500
# eval_iters = 200

# dropout = 0.0
# head_size = n_embed
print(decode(out[0]))


wf
fahereTtsEtneohc et  Thi edheetyitnt'rwi dsudp aiFtewhl uMe eroronrN UPiosrWnseo,f  Hoi  nmaedraaahnl AO rwThl ridd   vermeoeuliut ilt ,
  eoa:ouu aKGdtdwd c m'irrtmdoeIal:bgly ro dnen   o dud ssoiKihorn, !rtdireprch  se   nr ep itlemedirevlgo ei  Toi f,elOy.o tOoiui I,ttthh
uIidew a, rr tpvtugnpm  Lrsl:nff g  as eTUt:twt
ntaawfnerm en U .brw rsf'Hll,er w  h.  u eincea fsG hoIemoatonoaidwh o itm
oosenCsgyeroniehdaaewth yearbnicMwyvynnhI aaaiqNa.hlesmu ao eoee,o  Tnaaf 
 mAhOdlwhtde
soTctLt;aI


# Let's try a bigger GPT model. Ideally, if you try the following model 10x times (5000 max_iters), you will start seeing some coherent language.

In [None]:
# n_layer = 6 # Number of transformer blocks to put in sequence
# n_head = 6 # number of multi heads in each transformer block

# # Parameters for each head are as follows:
# batch_size = 64
# block_size = 256
# n_embed = 384 # 64*6 = batch_size * n_head
# learning_rate = 3e-4
# device = 'cuda' if torch.cuda.is_available() else 'cpu'

# max_iters = 500
# eval_interval = 10
# eval_iters = 20

# dropout = 0.2
# head_size = n_embed

model = GPTModel()
m = model.to(device)
print("device = ", device)
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    start = time.time()

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['test']:.4f}, time taken = {time.time()-start} seconds")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
out = m.generate(context, max_new_tokens=500)

In [187]:
print(decode(out[0]))


ms
otim:heisn
dAevtinknhr:csi'hvrduRRolCuitut ow dhwt yhtreityi.mw oru
hvie? tnr l a  rnnne hn h eseesm  p  mRtwabth b igaeT:enas iein gepitoobdaneddh hltH ,ertSuyslKsb nm  b onl y teod ynrs,seldd,or ;gtToeTosilyoe itsybmb aryNfAMddR : alhfcr rtph
n
artawdsrn
  ariaIi
vsaema ljw vWh , hodTohNkuyOiseYewChntrIi dtdhy ieado. eha
odesl wueUttaeik,l s
 h dnyhS
rl ere kd C vg
veretEesfh ta  et .yaod lyeBUtem
esA eytt:euya
n,
EuDsAnfolhE,arsEiL uw an.e; leensecedle :hduthd rmko eerAouEd arlB e
j;t wy. 
