In [2]:
import os, sys
import ipdb  # for debugging
from tqdm import tqdm
from datetime import datetime
import platform, shutil  # detect platform type
import requests, zipfile, io

# Pytorch
import torch
import torch.nn as nn
from torch.nn import functional as F

import sentencepiece as spm  # For the tokenizer

# These lines improve performance for Ampere Architecture (e.g: A100s)
torch.backends.cuda.matmul.allow_tf32 = True  # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True  # allow tf32 on cudnn
# Empty GPU cache memory
torch.cuda.empty_cache()

# A-) PREWORKING

### 1-) Requirements installed

In [3]:
#!pip install wandb
# 22af9a162cd0b2ad0d4643a01a00657222e874bd
#!pip install jupyter notebook

In [4]:
#!nvidia-smi

In [5]:
# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Print the device being used
print(f"Using device: {device}")


Using device: cuda


### 2-) Parameters

- Architecture Parameters

In [6]:
# Transformers encoder inputs will be (8, 512, 384) # 8 batches, 512 features/token each iteration, 384 token vector
batch_size = 8 # number of samples in each iteration
context_size = 512  # how many words/tokens will be taken each iteration
embedded_size = 384  # vector size for the each token
n_layers = 7 # number of layers in the encoder layer
n_heads = 7 # number of heads in the multi-head attention mechanism
BIAS = True # if True, add bias to the output of the linear layer

- HyperParameters 

In [7]:
lr = 3e-4
dropout = 0.05 # randomly zero out some input units with probability dropout. This avoids overfitting.
weight_decay = 0.01 # regularization parameter for weight decay. Smaller values will result in stronger regularization. like L1 or L2 regularization.
grad_clip = 1.0 # avoid exploding gradients.

- Training Parameters

In [27]:
train_iterations = 100000
eval_interval = 50 # each 50 iterations, the model will be evaluated on the validation set
eval_iters = 10
compile = False # if True, the model will be compiled before training
checkpoint_path = 'data/models/' # path to save the model checkpoint
checkpoint_fn = 'data/my_latest.pt' # filename for the model checkpoint
checkpoint_load_fn = 'data/my_latest.pt' # filename for the model checkpoint
dtype = torch.bfloat16 # data type for the model

- Mode

In [9]:
inference = False # if True, the model will be used for inference
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


### 3-) wandb Logging

In [16]:
#import wandb
#wandb.login()

wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
wandb: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
wandb: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:wandb: Appending key for api.wandb.ai to your netrc file: C:\Users\User\_netrc


True

In [17]:
wandb_log = True 
wandb_project = "llm1"
wandb_run_name = "llm1" +datetime.now().strftime("%Y%m%d-%H%M%S")

if wandb_log:
    import wandb
    wandb.init(project=wandb_project, name=wandb_run_name)
    """ wandb.config.update({
        "lr": lr,
        "dropout": dropout,
        "weight_decay": weight_decay,
        "grad_clip": grad_clip,
        "train_iterations": train_iterations,
        "eval_interval": eval_interval,
        "eval_iters": eval_iters,
        "compile": compile,
        "checkpoint_path": checkpoint_path,
        "checkpoint_fn": checkpoint_fn,
        "checkpoint_load_fn": checkpoint_load_fn,
        "dtype": dtype,
        "inference": inference,
        "device": device
    }) """


wandb: Currently logged in as: ahmet-erdonmez77 (ahmet-erdonmez77-dci). Use `wandb login --relogin` to force relogin


### 4-) Load Dataset

In [19]:
with open('data/wiki.txt', 'r', encoding="utf-8") as f:
    text = f.read()
print(f"Dataset size: {len(text)} characters")
print(text[30000:30300])

Dataset size: 178255102 characters
terms.
For example, there are objects in two groups (as shown on the right). The objects are various shapes, where one group has 3 of them while the other has 2. When the two groups combine into one, the overall amount (sum) of the shapes become 5.

Vertical Addition

The animation above demonstrate


### 5-) Tokenize the dataset

In [20]:
sp = spm.SentencePieceProcessor(model_file="data/wiki_tokenizer.model")  # spm.SentencePieceProcessor is more advanced than NLTK's word_tokenize
vocab_size = sp.vocab_size()
print(vocab_size)

4096


In [21]:
def encode(s):
    return sp.Encode(s)
def decode(s):
    return sp.Decode(s)

In [22]:
print(encode("The quick brown fox jumps over the lazy dog."))
print(decode(encode("The quick brown fox jumps over the lazy dog.")))

[310, 4031, 116, 2895, 1090, 570, 285, 1172, 599, 1853, 4039, 751, 264, 314, 817, 4049, 3429, 4051]
The quick brown fox jumps over the lazy dog.


- Tutor created "encoded_data.pt" for time saving

In [23]:
if os.path.exists('data/encoded_data.pt'):
    data = torch.load('data/encoded_data.pt')
else: # shows how to create encoded_data.pt
    encoded_data = torch.tensor(encode(text), dtype=torch.long)
    torch.save(encoded_data, 'data/encoded_data.pt')

In [26]:
data, len(data)

(tensor([4031,   13, 4061,  ...,   13,   13,   13]), 59211077)

### 6-) Define the model

In [28]:
data_size = len(data)
spl = int(0.9 * data_size)
train_data = data[:spl] # 90% of the data for training
val_data = data[spl:] # 10% of the data for validation

print(f"Total data size: {data_size/1e6:.2f} Millions | Train data size: {len(train_data)/1e6:.2f} Millions | Validation data size: {len(val_data)/1e6:.2f}")

Total data size: 59.21 Millions | Train data size: 53.29 Millions | Validation data size: 5.92


In [30]:
# we will get 8 times of 512 tokens in each batch (8,512)
def get_batch(split):
    data = train_data if split == "train" else val_data
    inds = torch.randint(len(data) - context_size, (batch_size,)) # batch_size 8 and context_size 512. Each batch will have 8 examples. and each example will have 512 tokens.
    X = torch.stack([data[i:i+context_size] for i in inds])  # If we did not do (len(data) - context_size) instead of len(data), we would get out of range error
    y = torch.stack([data[i+1: i+context_size+1] for i in inds])# if above is 1000:1512 => this 1001:1513. we move window one token forward
    
    return X.to(device), y.to(device)

In [32]:
x,y = get_batch("train")
x.shape, y.shape

(torch.Size([8, 512]), torch.Size([8, 512]))

- THE MAGIC IS WE FORWARD WINDOW 1 TOKEN AHEAD EACH TIME.

In [33]:
print(x[0][:10])
print(y[0][:10])

tensor([ 709,  379,  658,   13,   13, 3463,  442,  709,  379,  658],
       device='cuda:0')
tensor([ 379,  658,   13,   13, 3463,  442,  709,  379,  658,  299],
       device='cuda:0')


# B-) TRANSFORMER MODEL

### 1-) Define the transformer block

In [78]:
class GPT(nn.Module):
    def __init__(self):
        super(GPT, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedded_size) # 4096 x 384 each vocab will have 384 size vector
        self.positions = nn.Embedding(context_size, embedded_size) # 512 x 384 we define positions of each token
        self.blocks = nn.Sequential(*[Block(n_heads) for _ in range(n_layers)])# creates n_layers of transformer blocks and each block has n_heads heads
        self.ln = nn.LayerNorm(embedded_size) # Z-transform
        self.final_linear = nn.Linear(embedded_size,vocab_size, bias=BIAS)
        
        self.apply(self._init_weights)
        
    def _init_weights(self, module):
        if isinstance(module,nn.Linear):
            #module.weight.data.normal_(mean=0.0, std=0.02)
            torch.nn.init.normal_(module.weight, std=0.02, mean=0.0)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, std=0.02, mean=0.0)
            
    def forward(self, input, targets = None): 
        # BS = Batch Size, SL=sequence Length or context Size
        loss = None
        BS, SL = input.shape # for us 8 x 512
        emb = self.embeddings(input) # for us 8 x 512 x 384
        position_ = self.positions(torch.arange(SL, device=device)) # for us 512 x 384
        
        x = emb + position_  # for us 8 x 512 x 384
        
        x = self.blocks(x) # pass through transformer blocks. BS x SL x 384
        x = self.ln(x)  # normalization. BS x SL x 384 (embedding size)

        logits = self.final_linear(x)  # pass through linear layer BS x SL x 4096 (vocab size)
        #probs = torch.softmax(logits, dim=-1)  # get probabilities for each token. BS x SL x 4096
        
        
        
        if targets is not None:
            BS, SL, VS = logits.shape
            logits = logits.view(BS*SL, VS)  # BS*SL x 4096
            targets = targets.view(BS*SL)  # BS*SL x 1
            loss = F.cross_entropy(logits, targets)  # calculate loss for each token. BS*SL x 1
            
            # manually calculate loss
            #counts = logits.exp()
            #prob = counts / counts.sum(dim=-1, keepdim=True)
            #loss2 = - prob[torch.arange(BS*SL), targets].log().mean()
            
            #if(not torch.allclose(loss, loss2)):
            #print(f"[Loss Difference] Pytorch: {loss.item()} vs Manual: {loss2.item()}")
        
        return logits, loss
    
    # Predicts next token based on previous tokens
    def generate(self, input, max_length=500):
        for _ in range(max_length):
            input = input[:, -context_size:] # take last context_size, 512 of tokens 
            logits, _ = self.forward(input) # (1, input_size, vocab_size)
            logits = logits[:, -1, :]  # pick the last token's logits (1, vocab_size)
            probs = F.softmax(logits, dim=-1) # get probabilities for each token (1, vocab_size)
            next_token = torch.multinomial(probs, 1)#.squeeze(1)  # sample the next token (1, 1)
            input = torch.cat((input, next_token), dim=1)  # add the sampled token to the input
            
        return input
        


class Block(nn.Module):
    def __init__(self, n_heads):
        super().__init__()
        
        head_size = embedded_size // n_heads # 384 // 7 ~ 54
        self.ma = MultiHeadAttention(n_heads, head_size)
        self.feed_forward = ForwardLayer(embedded_size)
        self.ln1 = nn.LayerNorm(embedded_size) # Z-transform
        self.ln2 = nn.LayerNorm(embedded_size) # Z-transform
        
    def forward(self, x):
        x = x + self.ma(self.ln1(x))
        x = x + self.feed_forward(self.ln2(x))
        return x
        

class MultiHeadAttention(nn.Module):
    def __init__(self, n_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(n_heads)])
        self.combine = nn.Linear(n_heads*head_size, embedded_size) # (7*54 ,384)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        x = torch.cat([head(x) for head in self.heads], dim=-1) # dim=-1 do this on last dimension. dim = 0 do this on first dimension.
        # each head outputs (BS, SL, head_size)
        x = self.combine(x)
        x = self.dropout(x)
        return x 
        
        

class ForwardLayer(nn.Module):
    def __init__(self, embedded_size):
        super().__init__()
        self.network = nn.Sequential(
            nn.Linear(embedded_size, 6*embedded_size, bias=BIAS),
            nn.GELU(),
            nn.Linear(6*embedded_size, embedded_size, bias=BIAS),
            nn.Dropout(dropout)
        )
        
    def forward(self, x):
        return self.network(x)
    
    
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.q = nn.Linear(embedded_size, head_size, bias=BIAS)
        self.k = nn.Linear(embedded_size, head_size, bias=BIAS)
        self.v = nn.Linear(embedded_size, head_size, bias=BIAS)
        self.out = nn.Linear(head_size, embedded_size, bias=BIAS)
        
        

In [79]:
x,y = get_batch("train")
print(x[0][:10]) # inputs
print(y[0][:10]) # targets

model = GPT()

model = model.to(dtype)
model = model.to(device)

logits, loss = model(x, y)
print(loss.item())


tensor([4053,  830,  909, 1714,  289,  540, 3933, 2340,  490,  827],
       device='cuda:0')
tensor([ 830,  909, 1714,  289,  540, 3933, 2340,  490,  827,  372],
       device='cuda:0')


TypeError: MultiHeadAttention.forward() missing 2 required positional arguments: 'key' and 'value'

In [None]:
arr = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8]).reshape(2, 4)
arr[:, :3]

In [80]:
384//7

54

### 2-) Generate a sample 

In [73]:
@torch.no_grad()
def generate_sample(input):
    t1 = torch.tensor(encode(input), dtype=torch.long, device=device).unsqueeze(0) # (1, size of ids)
    #t1 = t1[None, :] # (1, size of ids)
    newgen = model.generate(t1, max_length=64)[0].tolist()
    
    result = decode(newgen)
    
    print(result)
    
generate_sample("The quick brown fox jumps over the lazy dog.")

The quick brown fox jumps over the lazy dog.ensusged Maxargeralaska species Kansaspeciallypr Secretary uses AlAr0 educ crime El manager joinedaut miles cardarth Loulic African met Leva transtt no different Inter up itself� Ab Wood wid Smith Dou becomingrict Choms comple angosestit L ph perSps Armyicianether believeelled sen getmy


In [68]:
a = torch.tensor([2,3,4,5,6,7,8,9,10,11])
a, a.shape, a.unsqueeze(0).shape,

(tensor([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11]),
 torch.Size([10]),
 torch.Size([10, 1]))

In [75]:
384/7

54.857142857142854