In [1]:
import torch
import torch.nn as nn

In [2]:
with open(r"C:\Users\Reyan\Desktop\LLM\Tokenizer\J. K. Rowling - Harry Potter 1 - Sorcerer's Stone.txt",'r') as file:
    raw_text = file.read()

In [3]:
raw_text[:99]

"Harry Potter and the Sorcerer's Stone\n\n\nCHAPTER ONE\n\nTHE BOY WHO LIVED\n\nMr. and Mrs. Dursley, of nu"

In [4]:
import tiktoken
tokenizer = tiktoken.get_encoding('gpt2')

In [5]:
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader_v1(txt, batch_size, max_length, stride, shuffle, drop_last, num_workers):

    tokenizer = tiktoken.get_encoding("gpt2")

    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader

In [6]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,  
    "context_length": 256, 
    "emb_dim": 768,        
    "n_heads": 12,         
    "n_layers": 12,        
    "drop_rate": 0.1,      
    "qkv_bias": False     
}

In [7]:
train_ratio = 0.9
split_idx = int(len(raw_text)*train_ratio)
train_data = raw_text[:split_idx]
val_data = raw_text[split_idx:]

In [8]:
train_loader = create_dataloader_v1(train_data,2,GPT_CONFIG_124M['context_length'],GPT_CONFIG_124M['context_length'],True,True,0)
val_loader = create_dataloader_v1(val_data,2,GPT_CONFIG_124M['context_length'],GPT_CONFIG_124M['context_length'],False,False,0)

In [9]:
print(len(train_loader))
print(len(val_loader))

205
23


In [10]:
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

In [11]:
class GELU(nn.Module):
    def __init__(self):
        super().__init__()
    def forward(self,x):
        return 0.5 * x * (1 + torch.tanh(
            torch.sqrt(torch.tensor(2.0 / torch.pi)) * 
            (x + 0.044715 * torch.pow(x, 3))
        ))

In [12]:
class feedforwardnn(nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.layer = nn.Sequential(
            nn.Linear(cfg['emb_dim'],4*cfg['emb_dim']),
            nn.GELU(),
            nn.Linear(4*cfg['emb_dim'],cfg['emb_dim'])
        )
    def forward(self,x):
        return self.layer(x)

In [13]:
class multiheadattentionv2(nn.Module):
    def __init__(self,d_in,d_out,context_len,num_head):
        super().__init__()
        self.d_out = d_out
        self.wq= torch.nn.Linear(d_in,d_out)
        self.wk= torch.nn.Linear(d_in,d_out)
        self.wv= torch.nn.Linear(d_in,d_out)

        self.mask = torch.triu(torch.ones(context_len,context_len),diagonal=1)
        self.num_head = num_head
        self.head_dim = d_out//self.num_head
        self.out_pro = torch.nn.Linear(d_out,d_out)

    def forward(self,x):
        b,num_token,d_in = x.shape
        query = self.wq(x)
        key = self.wk(x)
        value = self.wv(x)

        #d_out = num_head*head_dim
        query = query.view(b,num_token,self.num_head,self.head_dim)
        key = key.view(b,num_token,self.num_head,self.head_dim)
        value = value.view(b,num_token,self.num_head,self.head_dim)

        query = query.transpose(1,2)
        key = key.transpose(1,2)
        value = value.transpose(1,2)

        att_score = query @ key.transpose(2,3)
        masked_att_score = att_score.masked_fill(self.mask.bool()[:num_token, :num_token],-torch.inf)
        masked_att_weight = torch.softmax(masked_att_score/(key.shape[-1]**0.5),dim=-1)

        context_vec = (masked_att_weight @ value).transpose(1,2)
        context_vec = context_vec.contiguous().view(b, num_token, self.d_out)
        context_vec = self.out_pro(context_vec)

        return context_vec

In [14]:
class Transformer(nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.layernorm1 = LayerNorm(cfg['emb_dim'])
        self.layernorm2 = LayerNorm(cfg['emb_dim'])
        self.ff = feedforwardnn(cfg)
        self.mha = multiheadattentionv2(d_in=cfg['emb_dim'],d_out=cfg['emb_dim'],context_len=cfg['context_length'],num_head=cfg['n_heads'])

    def forward(self,x):
        shortcut=x
        x=self.layernorm1(x)
        x=self.mha(x)
        x=x+shortcut
        
        shortcut=x
        x=self.layernorm2(x)
        x=self.ff(x)
        x=x+shortcut
        return x

In [15]:
class GPT(nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.token_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.positional_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.dropout = nn.Dropout(cfg['drop_rate'])

        self.trf = nn.Sequential(
            *[Transformer(cfg) for _ in range(cfg['n_layers'])]
        )
        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(cfg["emb_dim"], cfg["vocab_size"])
        
    def forward(self,input_idx):
        b,seq_len = input_idx.shape
        tok_embed = self.token_emb(input_idx)
        pos_embed = self.positional_emb(torch.arange(seq_len))
        x = tok_embed + pos_embed
        x = self.dropout(x)
        x = self.trf(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

In [16]:
model = GPT(GPT_CONFIG_124M)
model.eval()

GPT(
  (token_emb): Embedding(50257, 768)
  (positional_emb): Embedding(256, 768)
  (dropout): Dropout(p=0.1, inplace=False)
  (trf): Sequential(
    (0): Transformer(
      (layernorm1): LayerNorm()
      (layernorm2): LayerNorm()
      (ff): feedforwardnn(
        (layer): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (mha): multiheadattentionv2(
        (wq): Linear(in_features=768, out_features=768, bias=True)
        (wk): Linear(in_features=768, out_features=768, bias=True)
        (wv): Linear(in_features=768, out_features=768, bias=True)
        (out_pro): Linear(in_features=768, out_features=768, bias=True)
      )
    )
    (1): Transformer(
      (layernorm1): LayerNorm()
      (layernorm2): LayerNorm()
      (ff): feedforwardnn(
        (layer): Sequential(
          (0): Linear(in_features=768, out_features=

In [17]:
def nextwordprediction(model,idx,max_new_token,context_len):
    for _ in range(max_new_token):
        idx_cond = idx[:,-context_len:]
        logits = model(idx_cond)
        logit = logits[:,-1,:]
        prob = torch.softmax(logit,dim=-1)
        idx_next = torch.argmax(prob,dim=-1,keepdim=True)
        idx = torch.cat((idx,idx_next),dim=1)
    return idx

In [20]:
def cal_loss(input_batch,target_batch,model):
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(logits.flatten(0,1),target_batch.flatten())
    return loss

def loss_function(data_loader,num_batch=None):
    total_loss=0
    if num_batch is None:
        num_batch = len(data_loader)
    num_batch = min(num_batch,len(data_loader))
    for i,(input_batch,target_batch) in enumerate(data_loader):
        if i < num_batch:
            loss = cal_loss(input_batch,target_batch,GPT(GPT_CONFIG_124M))
            total_loss += loss.item()
        else:
            break
    return total_loss/num_batch

In [21]:
train_loss = loss_function(train_loader)
val_loss = loss_function(val_loader)
print('train_loss:',train_loss)
print('val_loss:',val_loss)

train_loss: 10.995641010563547
val_loss: 10.992317199707031


In [22]:
def training(num_epoch,train_loader,val_loader,optimizer,model,eval_freq,eval_iter,tokenizer):
    train_losses=[]
    val_losses=[]
    global_step=-1
    for epoch in range(num_epoch):
        model.train()
        for input_batch,target_batch in train_loader:
            optimizer.zero_grad()
            loss = cal_loss(input_batch,target_batch,model)
            loss.backward()
            optimizer.step()
            global_step+=1

            if global_step%eval_freq ==0:
                train_loss = loss_function(train_loader,num_batch=eval_iter)
                val_loss = loss_function(val_loader,num_batch=eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                print(f"Ep {epoch+1} (Step {global_step:06d}): "f"Train loss {train_loss:.3f}, Val loss {val_loss:.3f}")

    return train_losses,val_losses

In [None]:
model = GPT(GPT_CONFIG_124M)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)
num_epochs = 10
train_losses, val_losses = training(num_epochs,train_loader, val_loader, optimizer, model, eval_freq=5, eval_iter=5,tokenizer=tokenizer)

Ep 1 (Step 000000): Train loss 11.031, Val loss 11.005
Ep 1 (Step 000005): Train loss 10.975, Val loss 10.985
Ep 1 (Step 000010): Train loss 10.969, Val loss 10.983
Ep 1 (Step 000015): Train loss 10.984, Val loss 10.965
Ep 1 (Step 000020): Train loss 10.989, Val loss 11.000
Ep 1 (Step 000025): Train loss 10.985, Val loss 10.962
Ep 1 (Step 000030): Train loss 10.979, Val loss 10.991
Ep 1 (Step 000035): Train loss 11.011, Val loss 10.993
Ep 1 (Step 000040): Train loss 10.988, Val loss 11.037
Ep 1 (Step 000045): Train loss 10.983, Val loss 10.994
Ep 1 (Step 000050): Train loss 11.003, Val loss 11.000
Ep 1 (Step 000055): Train loss 10.974, Val loss 10.994
Ep 1 (Step 000060): Train loss 11.000, Val loss 10.982
Ep 1 (Step 000065): Train loss 10.995, Val loss 11.006
Ep 1 (Step 000070): Train loss 10.988, Val loss 10.976
Ep 1 (Step 000075): Train loss 10.971, Val loss 11.008
Ep 1 (Step 000080): Train loss 10.954, Val loss 11.014
Ep 1 (Step 000085): Train loss 11.009, Val loss 10.991
Ep 1 (Step