In [352]:
import os, sys
import ipdb  # for debugging
from tqdm import tqdm
from datetime import datetime
import platform, shutil  # detect platform type
import requests, zipfile, io
import math

# Pytorch
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.nn import DataParallel

import sentencepiece as spm  # For the tokenizer

# Enable TF32 math, which improves performance on Ampere-architecture GPUs (e.g. A100s)
torch.backends.cuda.matmul.allow_tf32 = True  # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True  # allow tf32 on cudnn
# Release cached GPU memory before the notebook starts allocating tensors
torch.cuda.empty_cache()

# A-) PREWORKING

### 1-) Requirements installed

In [353]:
# %pip install wandb
# NOTE(review): a 40-character hex token that looked like an API key was stored on this line
# in plain text and has been removed. If it was a real credential, revoke it and authenticate
# through `wandb login` or the WANDB_API_KEY environment variable instead.
# %pip install jupyter notebook

In [354]:
#!nvidia-smi

In [355]:
# Prefer the GPU whenever CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Report which device was selected.
print(f"Using device: {device}")


Using device: cuda


### 2-) Parameters

- Architecture Parameters

In [356]:
# Activations entering the transformer blocks have shape (batch_size, context_size, embedded_size) = (8, 512, 384)
batch_size = 8 # number of sequences sampled for each training/evaluation step
context_size = 512  # tokens per sequence; also sizes the positional-embedding table and the causal mask
embedded_size = 384  # width of each token/position embedding vector
n_layers = 7 # number of stacked transformer blocks (Block instances) inside GPT
n_heads = 7 # attention heads per block; head_size = 384 // 7 = 54, so the concatenated heads are 378 wide before being projected back to 384
BIAS = True # passed as bias= to every nn.Linear created by the model classes

- HyperParameters 

In [357]:
lr = 3e-4 # initial learning rate handed to AdamW; the cosine schedule anneals it down to lr / 10
dropout = 0.05 # probability used by every nn.Dropout layer; randomly zeroing activations helps against overfitting
weight_decay = 0.01 # AdamW weight-decay coefficient. LARGER values regularize more strongly; it is only applied to parameters with 2 or more dimensions
grad_clip = 1.0 # maximum gradient norm passed to clip_grad_norm_, to avoid exploding gradients

- Training Parameters

In [358]:
train_iters = 10000 # total training iterations; also the period of the cosine learning-rate schedule
eval_interval = 50 # every 50 iterations the model is evaluated on the train and validation splits
eval_iters = 3 # number of batches averaged per split in each evaluation
compile = False # if True, the model is wrapped with torch.compile before training. NOTE(review): this name shadows the built-in compile()
checkpoint_dir = "models/"  # Where do we store checkpoints?
checkpoint_fn = "latest.pt" # name of the checkpoint file written during training
# Name of the checkpoint file read when load_pretrained is True
checkpoint_load_fn = "latest.pt"
load_pretrained = False  # Do we want to load a pretrained model to continue training?
dtype = torch.bfloat16 # dtype the model is cast to with model.to(dtype)

- Mode

In [359]:
inference = False # if True, the interactive text-generation prompt loop is run
# NOTE: this rebinds `device` from the torch.device object created earlier to a plain string ("cuda" or "cpu")
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cuda


### 3-) wandb Logging

In [360]:
#import wandb
#wandb.login()

In [361]:
wandb_log = True  # set to False to run the notebook without Weights & Biases logging
wandb_project = "llm1"
wandb_run_name = "llm1" + datetime.now().strftime("%Y%m%d-%H%M%S")  # unique name per run

if wandb_log:
    # Imported lazily so the notebook still runs without wandb when logging is disabled.
    import wandb

    # Store the run configuration together with the run so experiments can be compared later.
    # dtype and device are stringified because they may not be plain serializable values.
    wandb.init(
        project=wandb_project,
        name=wandb_run_name,
        config={
            "batch_size": batch_size,
            "context_size": context_size,
            "embedded_size": embedded_size,
            "n_layers": n_layers,
            "n_heads": n_heads,
            "lr": lr,
            "dropout": dropout,
            "weight_decay": weight_decay,
            "grad_clip": grad_clip,
            "train_iters": train_iters,
            "eval_interval": eval_interval,
            "eval_iters": eval_iters,
            "compile": compile,
            "checkpoint_dir": checkpoint_dir,
            "checkpoint_fn": checkpoint_fn,
            "checkpoint_load_fn": checkpoint_load_fn,
            "dtype": str(dtype),
            "inference": inference,
            "device": str(device),
        },
    )


### 4-) Load Dataset

In [362]:
""" files_url = "https://ideami.com/llm_train"

# Downloading proceeds if we detect that one of the key files to download is not present
if not os.path.exists(f"encoded_data.pt"):
    print("Downloading files using Python")
    response = requests.get(files_url)
    zipfile.ZipFile(io.BytesIO(response.content)).extractall(".")
else:
    print(
        "you seem to have already downloaded the files. If you wish to re-download them, delete the encoded_data.pt file"
    )
 """

' files_url = "https://ideami.com/llm_train"\n\n# Downloading proceeds if we detect that one of the key files to download is not present\nif not os.path.exists(f"encoded_data.pt"):\n    print("Downloading files using Python")\n    response = requests.get(files_url)\n    zipfile.ZipFile(io.BytesIO(response.content)).extractall(".")\nelse:\n    print(\n        "you seem to have already downloaded the files. If you wish to re-download them, delete the encoded_data.pt file"\n    )\n '

In [363]:
# Read the whole raw text corpus into memory as one string.
with open('data/wiki.txt', mode='r', encoding="utf-8") as corpus_file:
    text = corpus_file.read()

# Report the corpus size and show a short excerpt as a sanity check.
print(f"Dataset size: {len(text)} characters")
excerpt = text[30000:30300]
print(excerpt)

Dataset size: 178255102 characters
terms.
For example, there are objects in two groups (as shown on the right). The objects are various shapes, where one group has 3 of them while the other has 2. When the two groups combine into one, the overall amount (sum) of the shapes become 5.

Vertical Addition

The animation above demonstrate


### 5-) Tokenize the dataset

In [390]:
# Load the SentencePiece tokenizer model stored next to the dataset
sp = spm.SentencePieceProcessor(model_file="data/wiki_tokenizer.model")  # spm.SentencePieceProcessor is more advanced than NLTK's word_tokenize
vocab_size = sp.GetPieceSize()  # number of pieces in the tokenizer vocabulary; sizes the embedding table and the output layer of GPT
print(vocab_size)

4096


In [365]:
def encode(s):
    """Tokenize ``s`` with the module-level SentencePiece processor and return the result of ``sp.Encode``."""
    token_ids = sp.Encode(s)
    return token_ids
def decode(s):
    """Turn token ids ``s`` back into text using the module-level SentencePiece processor."""
    decoded_text = sp.Decode(s)
    return decoded_text

In [366]:
# Round-trip check: encode a sentence, then decode the ids back to text.
sample_ids = encode("The quick brown fox jumps over the lazy dog.")
print(sample_ids)
print(decode(sample_ids))

[310, 4031, 116, 2895, 1090, 570, 285, 1172, 599, 1853, 4039, 751, 264, 314, 817, 4049, 3429, 4051]
The quick brown fox jumps over the lazy dog.


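- The tutor provided a pre-encoded `encoded_data.pt` file to save time; the cell below loads it when present and otherwise builds it from the raw text.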

In [367]:
# Tokenizing the whole corpus is slow, so the encoded tensor is cached on disk.
if os.path.exists('data/encoded_data.pt'):
    data = torch.load('data/encoded_data.pt')
else:  # cache miss: build encoded_data.pt from the raw text
    encoded_data = torch.tensor(encode(text), dtype=torch.long)
    torch.save(encoded_data, 'data/encoded_data.pt')
    # Bind `data` on this path too; previously it stayed undefined after a cache miss,
    # so every later cell that uses `data` failed with a NameError.
    data = encoded_data

In [368]:
# Peek at the encoded token tensor and its length
data, len(data)

(tensor([4031,   13, 4061,  ...,   13,   13,   13]), 59211077)

### 6-) Split the data and build batches

In [369]:
# Keep the first 90% of the token stream for training and hold out the final 10% for validation.
data_size = len(data)
spl = int(0.9 * data_size)  # index of the first validation token
train_data, val_data = data[:spl], data[spl:]

print(f"Total data size: {data_size/1e6:.2f} Millions | Train data size: {len(train_data)/1e6:.2f} Millions | Validation data size: {len(val_data)/1e6:.2f}")

Total data size: 59.21 Millions | Train data size: 53.29 Millions | Validation data size: 5.92


In [370]:
# Each batch holds batch_size windows of context_size tokens, i.e. (8, 512) with the defaults
def get_batch(split):
    """Sample a random batch of (input, target) token windows.

    Args:
        split: "train" selects ``train_data``; any other value selects ``val_data``.

    Returns:
        Tuple ``(X, y)`` of tensors moved to ``device``, each stacking ``batch_size``
        windows of ``context_size`` tokens. ``y`` is ``X`` shifted one token forward.
    """
    source = train_data if split == "train" else val_data

    # Start offsets are drawn below len(source) - context_size so that the target
    # window, which ends one token later than the input window, stays in range.
    starts = torch.randint(len(source) - context_size, (batch_size,))

    input_windows, target_windows = [], []
    for start in starts:
        stop = start + context_size
        input_windows.append(source[start:stop])
        target_windows.append(source[start + 1:stop + 1])  # same window moved one token ahead

    return torch.stack(input_windows).to(device), torch.stack(target_windows).to(device)

In [371]:
# Draw one training batch and display the shapes of the input and target tensors
x,y = get_batch("train")
x.shape, y.shape

(torch.Size([8, 512]), torch.Size([8, 512]))

- The key idea: the target `y` is the input `x` shifted one token ahead, so every position learns to predict the next token.

In [372]:
# Compare the first ten tokens of the first sample: y holds the same window as x moved one token ahead
print(x[0][:10])
print(y[0][:10])

tensor([ 682, 1429,  983,  302,  501, 1161, 1064,  293,  615,  261],
       device='cuda:0')
tensor([1429,  983,  302,  501, 1161, 1064,  293,  615,  261, 2108],
       device='cuda:0')


# B-) TRANSFORMER MODEL

### 1-) Define the transformer block

In [373]:
# With this notebook's settings: embedded_size=384, context_size=512, vocab_size=4096, n_heads=7, n_layers=7, BIAS=True
class GPT(nn.Module):
    """Transformer language model built from causally-masked attention blocks.

    Token and learned position embeddings are summed, passed through ``n_layers``
    ``Block`` modules and a final LayerNorm, then projected to vocabulary logits.
    """

    def __init__(self):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedded_size)  # one vector per vocabulary entry
        self.positions = nn.Embedding(context_size, embedded_size)  # one learned vector per position
        self.blocks = nn.Sequential(*[Block(n_heads) for _ in range(n_layers)])  # n_layers blocks with n_heads heads each
        self.ln = nn.LayerNorm(embedded_size)  # normalization applied after the last block
        self.final_linear = nn.Linear(embedded_size, vocab_size, bias=BIAS)  # projects to vocabulary logits

        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialize Linear/Embedding weights from N(0, 0.02) and zero the Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, std=0.02, mean=0.0)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, std=0.02, mean=0.0)

    def forward(self, input, targets=None):
        """Compute next-token logits and, when ``targets`` is given, the cross-entropy loss.

        Args:
            input: token ids of shape (BS, SL), where BS is the batch size and SL the sequence length.
            targets: optional token ids with the same number of elements as ``input``.

        Returns:
            ``(logits, loss)``. Without targets, logits are (BS, SL, vocab) and loss is None.
            With targets, logits are flattened to (BS*SL, vocab) and loss is a scalar tensor.
        """
        loss = None
        BS, SL = input.shape
        emb = self.embeddings(input)  # (BS, SL, embedded_size)

        # Build the position indices on the same device as the input instead of relying on
        # the notebook-level `device`, which may not match where the model and data live.
        position_ = self.positions(torch.arange(SL, device=input.device))  # (SL, embedded_size)

        x = emb + position_  # position vectors broadcast over the batch dimension

        x = self.blocks(x)  # (BS, SL, embedded_size)
        x = self.ln(x)

        logits = self.final_linear(x)  # (BS, SL, vocab)

        if targets is not None:
            BS, SL, VS = logits.shape
            # cross_entropy expects (N, classes) logits and (N,) targets
            logits = logits.view(BS * SL, VS)
            targets = targets.view(BS * SL)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, input, max_length=500):
        """Sample ``max_length`` new tokens and append them to ``input``.

        Args:
            input: token ids of shape (batch, prompt_length).
            max_length: number of tokens to sample.

        Returns:
            Tensor of shape (batch, prompt_length + max_length) holding the prompt
            followed by the sampled tokens.
        """
        for _ in range(max_length):
            # Only the last context_size tokens are fed to the model, but the full sequence
            # is kept. Cropping `input` itself silently dropped the prompt and earlier tokens
            # from the returned text once the sequence grew past context_size.
            context = input[:, -context_size:]
            logits, _ = self(context)  # (batch, window_length, vocab)
            logits = logits[:, -1, :]  # logits of the last position only
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)  # (batch, 1)
            input = torch.cat((input, next_token), dim=1)

        return input
        


class Block(nn.Module):
    """One transformer block: pre-norm multi-head attention, then a pre-norm feed-forward
    layer, each wrapped in a residual connection."""

    def __init__(self, n_heads):
        super().__init__()
        # Integer division: with embedded_size=384 and n_heads=7 each head is 54 wide.
        per_head = embedded_size // n_heads
        self.ma = MultiHeadAttention(n_heads, per_head)
        self.feed_forward = ForwardLayer(embedded_size)
        self.ln1 = nn.LayerNorm(embedded_size)  # normalizes the attention input
        self.ln2 = nn.LayerNorm(embedded_size)  # normalizes the feed-forward input

    def forward(self, x):
        """Apply attention and feed-forward sub-layers; the output has the same shape as ``x``."""
        attended = self.ma(self.ln1(x))
        x = x + attended
        transformed = self.feed_forward(self.ln2(x))
        return x + transformed


class MultiHeadAttention(nn.Module):
    """Runs ``n_heads`` independent ``Head`` modules, concatenates their outputs on the
    feature axis and projects the result back to ``embedded_size``."""

    def __init__(self, n_heads, head_size):
        super().__init__()
        attention_heads = []
        for _ in range(n_heads):
            attention_heads.append(Head(head_size))
        self.heads = nn.ModuleList(attention_heads)
        # Maps the concatenated width (n_heads * head_size) back to the model width.
        self.combine = nn.Linear(n_heads * head_size, embedded_size, bias=BIAS)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the combined attention output for ``x``."""
        per_head_outputs = [head(x) for head in self.heads]
        merged = torch.cat(per_head_outputs, dim=-1)  # join along the last (feature) dimension
        return self.dropout(self.combine(merged))
        
        

class ForwardLayer(nn.Module):
    """Position-wise feed-forward network: expand to 6x the width, GELU, project back, dropout."""

    def __init__(self, embedded_size):
        super().__init__()
        hidden_size = 6 * embedded_size
        layers = [
            nn.Linear(embedded_size, hidden_size, bias=BIAS),
            nn.GELU(),
            nn.Linear(hidden_size, embedded_size, bias=BIAS),
            nn.Dropout(dropout),
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the feed-forward stack to ``x``."""
        out = self.network(x)
        return out
    
    
class Head(nn.Module):
    """A single causally-masked scaled dot-product attention head."""

    def __init__(self, head_size):
        super().__init__()
        self.q = nn.Linear(embedded_size, head_size, bias=BIAS)  # query projection
        self.k = nn.Linear(embedded_size, head_size, bias=BIAS)  # key projection
        self.v = nn.Linear(embedded_size, head_size, bias=BIAS)  # value projection

        # Lower-triangular matrix of ones (context_size x context_size): entry [i, j] is 1
        # only when j <= i, so a position may attend to itself and to earlier positions.
        causal_mask = torch.tril(torch.ones(context_size, context_size))
        self.register_buffer('tril', causal_mask)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over ``x`` of shape (batch, seq_len, features) and return (batch, seq_len, head_size)."""
        _, seq_len, _ = x.shape
        queries = self.q(x)
        keys = self.k(x)
        values = self.v(x)

        # Dot-product scores scaled by 1 / sqrt(head_size): (batch, seq_len, seq_len)
        scale = keys.shape[-1] ** -0.5
        scores = queries @ keys.transpose(-2, -1) * scale

        # Positions after the current one get -inf so softmax assigns them zero weight.
        blocked = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(blocked, float('-inf'))

        weights = self.dropout(F.softmax(scores, dim=-1))
        return weights @ values

##### Manual walkthrough of a single attention head (disabled)

In [374]:
""" head_size = embedded_size // n_heads # 384 // 7 ~ 54
x,y = get_batch("train")
print("x and y shape: ",x.shape, y.shape) # inputs

x = x.to(device)
y = y.to(device)
embeddings = nn.Embedding(vocab_size, embedded_size).to(device)
positions = nn.Embedding(context_size, embedded_size).to(device)
queries = nn.Linear(embedded_size, head_size, bias=BIAS).to(device)
keys = nn.Linear(embedded_size, head_size, bias=BIAS).to(device)
values = nn.Linear(embedded_size, head_size, bias=BIAS).to(device)
tril = torch.tril(torch.ones(context_size, context_size)).to(device) # an upper triangular matrix of size (context_size, context_size)

emb = embeddings(x)
pos = positions(torch.arange(context_size, device=device))

x = emb + pos

# Multi-Head Attention

q = queries(x)
k = keys(x)
v = values(x)
print("q, k and v shape: ",q.shape, k.shape, v.shape)  # (8, 512, 54),
torch.set_printoptions(precision=2, sci_mode=False)
print(q[0][0][:5])

attn_weights = q @ k.transpose(-2,-1) * k.shape[-1]**0.5  # k.transpose(-2,-1) = (8, 54, 512) so ==> (8,512,54) @ (8,54,512) = (8,512,512)
attn_weights = attn_weights.masked_fill(tril[:context_size, :context_size] == 0, -float('inf'))
attn_weights = F.softmax(attn_weights, dim=-1)  # (8,512,512)

x = attn_weights @ v # (8,512,512) @ (8,512,54) = (8,512,54)
print(x[0][0])
print(attn_weights.shape)
print(x.shape) """

' head_size = embedded_size // n_heads # 384 // 7 ~ 54\nx,y = get_batch("train")\nprint("x and y shape: ",x.shape, y.shape) # inputs\n\nx = x.to(device)\ny = y.to(device)\nembeddings = nn.Embedding(vocab_size, embedded_size).to(device)\npositions = nn.Embedding(context_size, embedded_size).to(device)\nqueries = nn.Linear(embedded_size, head_size, bias=BIAS).to(device)\nkeys = nn.Linear(embedded_size, head_size, bias=BIAS).to(device)\nvalues = nn.Linear(embedded_size, head_size, bias=BIAS).to(device)\ntril = torch.tril(torch.ones(context_size, context_size)).to(device) # an upper triangular matrix of size (context_size, context_size)\n\nemb = embeddings(x)\npos = positions(torch.arange(context_size, device=device))\n\nx = emb + pos\n\n# Multi-Head Attention\n\nq = queries(x)\nk = keys(x)\nv = values(x)\nprint("q, k and v shape: ",q.shape, k.shape, v.shape)  # (8, 512, 54),\ntorch.set_printoptions(precision=2, sci_mode=False)\nprint(q[0][0][:5])\n\nattn_weights = q @ k.transpose(-2,-1) *

In [375]:
#tril

In [376]:
# Slicing demo: build a 2 x 4 tensor holding 1..8 and keep the first three columns of each row.
arr = torch.arange(1, 9).reshape(2, 4)
arr[:, :3]

tensor([[1, 2, 3],
        [5, 6, 7]])

In [377]:
# Integer division used for head_size in Block: embedded_size // n_heads with the default settings
384//7

54

##### Back to the main flow

### 2-) Generate a sample 

In [378]:
# Smoke test: run one batch through a freshly initialized model and print its loss.
x, y = get_batch("train")
print("x and y shape: ", x.shape, y.shape)  # inputs

# Build the model, move it to the target device, then cast it to the configured dtype.
model = GPT().to(device).to(dtype)

logits, loss = model(x, y)
print(loss.item())

x and y shape:  torch.Size([8, 512]) torch.Size([8, 512])
8.375


In [379]:
@torch.no_grad()
def generate_sample(input):
    """Generate 64 tokens that continue the text ``input`` and print the decoded result.

    Uses the notebook-level ``model``, ``encode``, ``decode`` and ``device``.
    ``input`` must encode to at least one token.
    """
    prompt_ids = torch.tensor(encode(input), dtype=torch.long, device=device)
    prompt_ids = prompt_ids[None, :]  # add a batch dimension: (1, number of ids)

    # Sample in eval mode so dropout is disabled, then restore whichever mode the model was
    # in. The training loop calls this right after the model was put back in train mode.
    was_training = model.training
    model.eval()
    try:
        newgen = model.generate(prompt_ids, max_length=64)[0].tolist()
    finally:
        model.train(was_training)

    result = decode(newgen)

    print(result)

#generate_sample("The quick brown fox jumps over the lazy dog.")

In [380]:
# unsqueeze(0) adds a leading batch dimension: shape (10,) becomes (1, 10).
a = torch.arange(2, 12)
(a, a.shape, a.unsqueeze(0).shape)

(tensor([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11]),
 torch.Size([10]),
 torch.Size([1, 10]))

### 3-) TRAINING

In [381]:
x, y = get_batch("train")
print("x and y shape: ", x.shape, y.shape)  # inputs

# Fresh model for training: move it to the target device, then cast it to the configured dtype.
model = GPT()
model = model.to(device)
model = model.to(dtype)

if compile:
    model = torch.compile(model)
    # Report success only after torch.compile has returned; the message used to be
    # printed before the call, so it appeared even when compilation raised.
    print("Torch compiled successfully")

print((sum(p.numel() for p in model.parameters()) / 1e6 , "Million parameters"))

x and y shape:  torch.Size([8, 512]) torch.Size([8, 512])
(19.837954, 'Million parameters')


##### Calculate Loss Average

In [382]:
@torch.no_grad()
def calculate_loss():
    """Estimate the mean loss on the training and validation data.

    For each of the splits "train" and "eval", averages the loss over ``eval_iters``
    random batches. The model is put in eval mode for the measurement and switched
    back to train mode before returning.

    Returns:
        dict mapping "train" and "eval" to the mean loss as a Python float.
    """
    model.eval()
    out = {}
    for split in ("train", "eval"):
        split_losses = torch.zeros(eval_iters)
        for step in range(eval_iters):
            xb, yb = get_batch(split)  # any split other than "train" reads the validation data
            split_losses[step] = model(xb, yb)[1]
        out[split] = split_losses.mean().item()

    model.train()
    return out

In [383]:
# Loss of the freshly initialized model on both splits, measured before any training step
out = calculate_loss()
out

{'train': 8.4375, 'eval': 8.4375}

##### Optimizer and learning-rate scheduler setup

In [391]:
#################################################################################
# Optimizer and learning-rate schedule
#################################################################################

# Trainable parameters keyed by name.
p_dict = dict((p_name, p) for p_name, p in model.named_parameters() if p.requires_grad)

# Split the parameters by dimensionality: tensors with 2+ dimensions (weight matrices,
# embedding tables) get weight decay; 1-D tensors such as biases and LayerNorm
# parameters do not.
weight_decay_p, no_weight_decay_p = [], []
for p in p_dict.values():
    if p.dim() >= 2:
        weight_decay_p.append(p)
    else:
        no_weight_decay_p.append(p)

# One optimizer group per weight-decay setting.
optimizer_groups = [
    dict(params=weight_decay_p, weight_decay=weight_decay),
    dict(params=no_weight_decay_p, weight_decay=0.0),
]

# betas control the exponential moving averages of the gradient and of its square.
optimizer = torch.optim.AdamW(optimizer_groups, lr=lr, betas=(0.9, 0.99))

# Cosine schedule over train_iters steps, descending to a tenth of the initial learning rate.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer, train_iters, eta_min=lr / 10
)

start_iteration = 0  # first training iteration; overwritten when a checkpoint is loaded
best_val_loss = float("inf")  # best validation loss seen so far


##### Loading Checkpoints

In [385]:
# Position of "cuda" in the device name: 0 when running on a GPU, -1 on a CPU-only machine.
# (This is a substring position, not a GPU index.) str.find is used instead of str.index,
# which raised ValueError and stopped the notebook whenever the device was "cpu".
print(str(device).find("cuda"))

0


In [386]:
def load_checkpoint(path):
    """Restore model and optimizer state from the checkpoint file at ``path``.

    Reads the keys "model_state_dict", "optimizer_state_dict", "iteration" and "loss".
    If the checkpoint also carries "scheduler_state_dict", the learning-rate scheduler
    is restored too; checkpoints without that key still load.

    Returns:
        Tuple ``(iteration, loss)`` as stored in the checkpoint.
    """
    print("LLM - Loading model")
    # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only machine.
    checkpoint = torch.load(path, map_location=device)
    model.load_state_dict(checkpoint["model_state_dict"])  # Load parameters
    optimizer.load_state_dict(checkpoint["optimizer_state_dict"])  # Load optimizer state
    if "scheduler_state_dict" in checkpoint:
        # Without this the schedule would restart from its initial learning rate on resume.
        scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
    iteration = checkpoint["iteration"]  # In what iteration did we save the model?
    loss = checkpoint["loss"]  # What was the last loss value?
    print(f"Loaded iter {iteration} with loss {loss}")
    return iteration, loss


################# OPTIONAL : LOAD A PREVIOUS CHECKPOINT
# Build the path once so the existence check and the load always refer to the same file
# (they previously used two different string constructions).
checkpoint_load_path = os.path.join(checkpoint_dir, checkpoint_load_fn)
if load_pretrained:
    if os.path.exists(checkpoint_load_path):
        start_iteration, loss = load_checkpoint(checkpoint_load_path)
        best_val_loss = loss
    else:
        # Say so explicitly instead of silently starting from scratch.
        print(f"Checkpoint {checkpoint_load_path} not found. Training from scratch.")


In [387]:
# Scratch cell: evaluates a constant expression and does not affect the rest of the notebook
1==1

True

##### Inference Loop

In [388]:
# Interactive prompt loop, active only when `inference` is set.
if inference:
    model.eval()
    while True:
        qs = input("Enter a text (q to quit): ")

        if qs == "q":
            break

        # Blank input is ignored and the prompt is shown again.
        if qs != "":
            generate_sample(qs)

##### Train Loop

In [None]:
#################################################################
###################### TRAINING #################################
#################################################################

# torch.save does not create missing directories, so make sure the target exists first.
if checkpoint_dir:
    os.makedirs(checkpoint_dir, exist_ok=True)
checkpoint_save_path = os.path.join(checkpoint_dir, checkpoint_fn)

try:
    for i in tqdm(range(start_iteration, train_iters)):
        xb, yb = get_batch("train")  # Get a new batch of data
        logits, loss = model(xb, yb)  # Run the LLM and get the logits and the loss

        if i % eval_interval == 0 or i == train_iters - 1:  # Periodic evaluation
            l = calculate_loss()
            print(f"\n{i}: train loss: {l['train']} / val loss: {l['eval']}")

            # Generate a short sample to observe how the output evolves during training
            generate_sample("The mountain in my city is")

            if l["eval"] < best_val_loss:  # New best validation loss: save a checkpoint
                best_val_loss = l["eval"]
                print("[CHECKPOINT]: Saving with loss: ", best_val_loss)
                torch.save(
                    {
                        "model_state_dict": model.state_dict(),
                        "optimizer_state_dict": optimizer.state_dict(),
                        # Stored so a resumed run can continue the learning-rate schedule
                        "scheduler_state_dict": scheduler.state_dict(),
                        "loss": best_val_loss,
                        "iteration": i,
                    },
                    checkpoint_save_path,
                )

            if wandb_log:
                wandb.log(
                    {
                        "loss/train": l["train"],
                        "loss/val": l["eval"],
                        "lr": scheduler.get_last_lr()[0],
                    },
                    step=i,
                )

        optimizer.zero_grad(set_to_none=True)  # Reset gradients
        loss.backward()  # Calculate new gradients

        # Clip the gradient norm to prevent exploding gradients, which would cause
        # unstable updates to the model weights.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip)

        optimizer.step()  # Update the model parameters
        scheduler.step()  # Update the learning rate value

except KeyboardInterrupt:
    print("Training interrupted. Cleaning up...")

finally:
    # One cleanup path for normal completion, interruption and errors alike:
    # close the wandb run exactly once, then release cached GPU memory.
    if wandb_log:
        wandb.finish()
    torch.cuda.empty_cache()
    print("GPU memory released.")
