In [108]:
#importing all the libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
from warnings import filterwarnings
filterwarnings('ignore')

In [156]:
#config
input_file_path = 'data/input.txt' #reading the input text file
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'currently running on: {device}')
batch_size = 16
block_size = 256 #context lenght
n_emb = 128
num_heads = 6 
n_layers = 6
dropout = 0.0
learning_rate = 1e-3 
min_lr = learning_rate * 0.1
max_steps = 2000
lr_decay_iters = max_steps
eval_interval = 250
KFold_iters = 100
# top_k = 200
# temperature = 0.1

currently running on: cuda


In [157]:
#for reproducibility
torch.manual_seed(1337)
torch.cuda.manual_seed(1337)
torch.cuda.empty_cache()

In [158]:
#reading the dataset for pretraining

with open(input_file_path , 'r') as f:
    text = f.read()
n1 = int(0.9*len(text))
chars = sorted(set(text))
vocab_size = len(chars)
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for i,ch in enumerate(chars)}
encoder = lambda x: [stoi[ch] for ch in x]
decoder = lambda x: ''.join([itos[i] for i in x])

data = torch.tensor(encoder(text),dtype = torch.long,device=device)

In [159]:
#splitting train and test & creating data loading batch
n1 = int(0.8*len(data))
n2 = int(0.9*len(data))
train_data = data[:n1]
dev_data = data[n1:n2]
test_data = data[n2:]

def get_batch(split):
    if split == 'train':
        data = train_data
    elif split == 'val':
        data = dev_data
    elif split == 'test':
        data = test_data
    ix = torch.randint(0, len(data) - block_size,(batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1: i+block_size + 1] for i in ix])
    x,y = x.to(device) , y.to(device)
    return x,y

In [186]:
no_of_batches_in_epoch = len(train_data) // (batch_size*block_size)
print('no of bathces in an epoch:',no_of_batches_in_epoch )

no of bathces in an epoch: 217


In [161]:
#model definition: GPT model
class Head(nn.Module):
    def __init__(self,head_size):
        super().__init__()
        self.key = nn.Linear(n_emb,head_size, bias = False)
        self.query = nn.Linear(n_emb,head_size, bias = False)
        self.value = nn.Linear(n_emb,head_size, bias = False)
        self.register_buffer('tril' , torch.tril(torch.ones(block_size,block_size)))
        self.dropout = nn.Dropout(dropout)
    
    def forward(self,x):
        B,T,C = x.shape
        k = self.key(x)
        q = self.query(x)
        v = self.value(x)
        # wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5
        # wei = wei.masked_fill(self.tril[:T,:T] == 0, float('-inf'))
        # wei = F.softmax(wei, dim = -1)
        # wei = self.dropout(wei)
        # out = wei @ v
        out = F.scaled_dot_product_attention(q,k,v,is_causal= True) #flash attention
        return out

class MultiHeadAttention(nn.Module):
    def __init__(self,num_heads,head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for i in range(num_heads)])
        self.proj = nn.Linear(num_heads * head_size , n_emb)
        self.proj.NANO_GPT_INIT = 1
        self.dropout = nn.Dropout(dropout)

    def forward(self,x):
        out = torch.cat([h(x) for h in self.heads], dim = -1)
        out = self.dropout(self.proj(out))
        return out


class FeedForwardNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.NANO_GPT_INIT = 1
        self.net = nn.Sequential(
            nn.Linear(n_emb, 4 * n_emb),
            nn.ReLU())
        self.proj = nn.Linear(n_emb * 4, n_emb)
        self.proj.NANO_GPT_INIT = 1
        self.dropout = nn.Dropout(dropout)
        
    def forward(self,x):
        out = self.dropout(self.proj(self.net(x)))
        return out


class Block(nn.Module):
    def __init__(self,num_heads):
        super().__init__()
        head_size = n_emb // num_heads
        self.sa = MultiHeadAttention(num_heads,head_size)
        self.ff = FeedForwardNetwork()
        self.ln1 = nn.LayerNorm(n_emb)
        self.ln2 = nn.LayerNorm(n_emb)

    def forward(self,x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ff(self.ln2(x))
        return x
        

class GPTModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.embedding_table = nn.Embedding(vocab_size,n_emb)
        self.position_emb = nn.Embedding(block_size,n_emb)
        self.block = nn.Sequential(*[Block(num_heads) for i in range(n_layers)])
        self.ln = nn.LayerNorm(n_emb)
        self.lm_head = nn.Linear(n_emb,vocab_size)
        #weight sharing scheme
        self.embedding_table.weight = self.lm_head.weight
        self.apply(self._init_weights)
        
    def _init_weights(self,module):
        std = 0.02
        if isinstance(module,nn.Linear):
            if hasattr(module, 'NANO_GPT_INIT'):
                std = 2 * (n_layers**-0.5)
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module,nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=std)

    
    def forward(self,idx,targets = None):
        B,T = idx.shape
        token_emb = self.embedding_table(idx)
        pos_emb = self.position_emb(torch.arange(T,device = device))
        x = token_emb + pos_emb
        x = self.block(x)
        x = self.ln(x)
        logits = self.lm_head(x)
        if targets is None:
            loss = None
        else:
            B,T,C = logits.shape
            logits = logits.view(B*T,C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits,targets)
        return logits,loss

    def generate(self,idx,max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:,-block_size:]
            logits,loss = self.forward(idx_cond)
            logits = logits[:,-1,:] 
            probs = F.softmax(logits, dim =-1)
            #topk_probs, topk_indices = torch.topk(probs, top_k, dim = -1)
            idx_new = torch.multinomial(probs, num_samples = 1, replacement = False)
            idx = torch.cat((idx_cond,idx_new), dim = 1)
        return idx

In [162]:
#call the model and printing trainable model params
model = GPTModel()
model.to(device)
print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')

1.222593 M parameters


In [163]:
#initializing optimier
optimizer = torch.optim.AdamW(model.parameters(),lr = learning_rate, betas=(0.9,0.95),weight_decay= 0.1)
epoch = 0

In [164]:
@torch.no_grad()
def estimate_loss():
    model.eval()
    out = {}
    for split in ['train','val']:
        losses = torch.zeros(KFold_iters)
        for i in range(KFold_iters):
            xb,yb = get_batch(split)
            logits,loss = model.forward(xb,yb)
            losses[i] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [176]:
#training loop
for step in range(4):
    for i in range(max_steps):
        if i % no_of_batches_in_epoch == 0: 
            epoch +=1
        if i % eval_interval == 0:
            losses = estimate_loss()
            print(f"step {i}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        optimizer.zero_grad(set_to_none=True)
        x,y = get_batch('train')
        with torch.autocast(device_type=device,dtype = torch.float16):  
            logits , loss = model.forward(x,y)
        loss.backward()
        norm = nn.utils.clip_grad_norm_(model.parameters(),1) #gpt3 paper says
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,max_steps,min_lr) #gpt 3 paper says
        optimizer.step()
        scheduler.step(i)
        torch.cuda.synchronize()
        print(f"step {i} | loss_acum:{loss.item():.6f} | lr: {scheduler.get_lr()[0]: .4e} | norm:{norm: .4f}")
        if i%200 == 0:
            context = torch.zeros(1,1 , dtype = torch.long, device = device)
            out = decoder(model.generate(context, 500).tolist()[0])
            print(out)

step 0: train loss 1.3741, val loss 1.5463
step 0 | loss_acum:1.293981 | lr:  1.0000e-03 | norm: 1.0638
me
And eyeBurned in Hereford him?

PRINCE EDWARD:
Sorrow and his remourable,
Thre teding of chastor'd my stay,
Dire company bads, or death crowned the first
Since this truth butters: sail those for butchiry,
Fatces to the merry thy will one can under cleept
step 1 | loss_acum:1.361403 | lr:  1.0000e-03 | norm: 1.0950
step 2 | loss_acum:1.332302 | lr:  1.0000e-03 | norm: 1.8844
step 3 | loss_acum:1.403914 | lr:  9.9999e-04 | norm: 1.7256
step 4 | loss_acum:1.430158 | lr:  9.9999e-04 | norm: 1.7355
step 5 | loss_acum:1.534000 | lr:  9.9998e-04 | norm: 1.5947
step 6 | loss_acum:1.507087 | lr:  9.9997e-04 | norm: 1.5691
step 7 | loss_acum:1.482707 | lr:  9.9997e-04 | norm: 1.6577
step 8 | loss_acum:1.449585 | lr:  9.9996e-04 | norm: 1.5860
step 9 | loss_acum:1.520351 | lr:  9.9995e-04 | norm: 1.7364
step 10 | loss_acum:1.451004 | lr:  9.9993e-04 | norm: 1.4811
step 11 | loss_acum:1.45998

In [177]:
losses['train'],losses['val'] #last recorded Train Val Loss

(tensor(1.3115), tensor(1.5269))

In [179]:
print(epoch)

40


In [180]:
for i in range(5):
    context = torch.zeros(1,1 , dtype = torch.long, device = device)
    out = decoder(model.generate(context, 300).tolist()[0])
    print(out)

ne's toward king!

RICHMOND:
Hold received the grant are well caught her,
Side even chysel. Have wask me her one safe
Than I was acpin to your viols talk in
His upon you: I own mean bhild up awas than thee
are is of fak! my tongued and doings; for eath
of m
ion,
More to one so not lightly spirits,
But save appriof at officersation me
To swill givil out enter to the mileguable,
Being and to-night root aritoroous be thus
Time letters for him up to me a
For some left by of late him that took find appear,
Ere devi

Thou should I think what he asvices; therefore
To bear enmity, that to slow their do
that the bover of all-hood your limbs:
You wilt first.

ANGELO:
If my best, 'let y--


ISABELLA:
O the city; enough, look to hear shall be comes!--

BENVOLIO:
For Marcius:
ays to fine mad, sit by this ours.
To fleser, pardon, truspetitlut, the soon.

SICINIUS:
Now it is sow; since take you of thy mother's
With slain off, let I have drownrage, let you
do.

MENENIUS:
O--Mercy, if you,
Love's where

In [181]:
@torch.no_grad()
def estimate_loss1():
    model.eval()
    out = {}
    for split in ['train','val','test']:
        losses = torch.zeros(KFold_iters)
        for i in range(KFold_iters):
            xb,yb = get_batch(split)
            logits,loss = model.forward(xb,yb)
            losses[i] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [184]:
losses = estimate_loss1()
print(f" train loss {losses['train']:.4f}, val loss {losses['val']:.4f},  dev loss {losses['test']:.4f}")

 train loss 1.3055, val loss 1.5195,  dev loss 1.7704
