In [55]:
import os, sys
import ipdb  # for debugging
from tqdm import tqdm
from datetime import datetime
import platform, shutil  # detect platform type
import requests, zipfile, io
import math

# Pytorch
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.nn import DataParallel

import sentencepiece as spm  # For the tokenizer

# Start from a clean slate: drop any cached, unused GPU memory
torch.cuda.empty_cache()
# Permit TensorFloat-32 kernels, which speed things up on Ampere-class GPUs (e.g. A100)
torch.backends.cudnn.allow_tf32 = True  # cuDNN operations
torch.backends.cuda.matmul.allow_tf32 = True  # matrix multiplications

### 1-) Set Parameters

In [56]:
# Central configuration: every tunable value read by the later cells lives here.

# --- Architecture ---
batch_size = 8  # sequences per batch; author's guidance: ~8 for a 4GB GPU, ~128 for a 24GB GPU
context = 512  # tokens per training sequence (the context window)
embed_size = 384  # width of each token embedding
n_layers = 8  # number of stacked transformer blocks
n_heads = 8  # attention heads inside each block
BIAS = True  # whether Linear layers carry a bias term

# --- Optimisation hyperparameters ---
lr = 3e-4  # starting learning rate
dropout = 0.05  # dropout probability
weight_decay = 0.01  # weight-decay coefficient
grad_clip = 1.0  # maximum gradient norm used for clipping

# --- Training schedule ---
train_iters = 50000  # upper bound on training iterations
eval_interval = 50  # run an evaluation every this many iterations
eval_iters = 10  # batches averaged per evaluation
compile = False  # whether to compile the model (note: this name shadows Python's built-in compile)
load_pretrained = True  # resume from an existing checkpoint when one is found

# --- Checkpoints ---
checkpoint_dir = "data/models/"  # directory that holds checkpoint files

checkpoint_fn = "latest_1.pt"  # file written during training

checkpoint_load_fn = "latest_1.pt"  # file read when load_pretrained is True
# (the author suggests llm2.pt as a checkpoint that already reached a loss of 2.31)

# --- Logging and numerics ---
wandb_log = True  # send metrics to Weights & Biases
dtype = torch.bfloat16  # data type the model is cast to

# --- Mode ---
inference = False  # True runs the interactive inference loop

# --- Device: prefer the GPU whenever one is available ---
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("device: You will be using: ", device)

device: You will be using:  cuda


### 2-) Load Dataset and tokenize the text

In [57]:
# Read the whole training corpus into memory as a single string
with open("data/wiki.txt", mode="r", encoding="utf-8") as f:
    text = f.read()

# Report the corpus size, then preview a 300-character excerpt
print(f"Dataset size: {len(text)} characters")
print(text[30000:30300])

Dataset size: 178255102 characters
terms.
For example, there are objects in two groups (as shown on the right). The objects are various shapes, where one group has 3 of them while the other has 2. When the two groups combine into one, the overall amount (sum) of the shapes become 5.

Vertical Addition

The animation above demonstrate


In [58]:
# Load the SentencePiece tokenizer model from disk
sp = spm.SentencePieceProcessor(model_file="data/my_wiki_tokenizer.model")

# The number of pieces known to the tokenizer is the model's vocabulary size
vocab_size = sp.GetPieceSize()
print(vocab_size)

4096


In [59]:
def encode(s):
    """Tokenize ``s`` with the SentencePiece processor ``sp`` and return the result."""
    token_ids = sp.Encode(s)
    return token_ids


def decode(s):
    """Turn token ids ``s`` back into text with the SentencePiece processor ``sp``."""
    decoded_text = sp.Decode(s)
    return decoded_text

In [60]:
# Round-trip check: show the token ids, then decode them back to text
sample_sentence = "The quick brown fox jumps over the lazy dog."
print(encode(sample_sentence))
print(decode(encode(sample_sentence)))

[310, 4031, 116, 2897, 1090, 570, 285, 1172, 599, 1853, 4039, 751, 264, 314, 817, 4049, 3429, 4051]
The quick brown fox jumps over the lazy dog.


In [61]:
# Tokenized corpus cache: reuse the saved tensor when it exists, otherwise
# tokenize the text once and save it for later runs.
# Either way `data` must be defined afterwards, because the following cells read it.
encoded_data_path = "data/my_encoded_data.pt"
if os.path.exists(encoded_data_path):
    data = torch.load(encoded_data_path)
else:
    encoded_data = torch.tensor(encode(text), dtype=torch.long)
    torch.save(encoded_data, encoded_data_path)
    data = encoded_data  # previously left undefined on a first run (NameError below)


In [62]:
# Total number of tokens in the encoded corpus
len(data)

59211077

### 3-) Train/Validation Split and Batching

In [63]:
# Hold out the final 10% of the token stream for validation
data_size = len(data)
spl = int(0.9 * data_size)  # index where the validation part starts
train_data, val_data = data[:spl], data[spl:]

split_summary = f"Total data size: {data_size/1e6:.2f} Millions | Train data size: {len(train_data)/1e6:.2f} Millions | Validation data size: {len(val_data)/1e6:.2f}"
print(split_summary)

Total data size: 59.21 Millions | Train data size: 53.29 Millions | Validation data size: 5.92


In [64]:
# Each batch holds `batch_size` windows of `context` tokens, e.g. (8, 512)
def get_batch(split):
    """Sample a random batch of input/target token windows.

    Args:
        split: "train" draws from ``train_data``; any other value draws from
            ``val_data``.

    Returns:
        ``(X, y)``, both stacked from ``batch_size`` windows of ``context``
        tokens and moved to ``device``. ``y`` is ``X`` shifted one token
        forward, i.e. the next-token targets.
    """
    source = train_data if split == "train" else val_data
    # randint's upper bound is exclusive, so even the largest start leaves room
    # for the target window, which ends one token later than the input window.
    starts = torch.randint(len(source) - context, (batch_size,)).tolist()
    inputs = torch.stack([source[s : s + context] for s in starts])
    targets = torch.stack([source[s + 1 : s + context + 1] for s in starts])

    return inputs.to(device), targets.to(device)

In [65]:
# Draw one training batch; print its shapes and the first ten tokens of the
# first input/target pair (the target is the input shifted by one token)
xb, yb = get_batch("train")
print(xb.shape, yb.shape)
print(xb[0][:10], yb[0][:10])

torch.Size([8, 512]) torch.Size([8, 512])
tensor([4031, 4065, 1014, 4031, 4056, 4065, 4085, 4056, 4070,  299],
       device='cuda:0') tensor([4065, 1014, 4031, 4056, 4065, 4085, 4056, 4070,  299,  261],
       device='cuda:0')


### 4-) The Transformer Architecture with classes

In [66]:
#batch_size = 8 | context = 512  | embed_size = 384  | n_layers = 8  | n_heads = 8  

class GPT(nn.Module):
    """Decoder-only Transformer language model.

    Token embeddings plus learned position embeddings feed a stack of
    ``n_layers`` ``TransformerBlock`` modules, followed by a LayerNorm and a
    linear projection onto the vocabulary.
    """

    def __init__(self):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embed_size)  # token id -> vector
        self.positions = nn.Embedding(context, embed_size)  # learned vector per position
        # n_layers blocks, each holding attention (n_heads heads) + feed-forward
        self.blocks = nn.Sequential(*[TransformerBlock(n_heads) for _ in range(n_layers)])
        self.ln = nn.LayerNorm(embed_size)
        # Projects each position's vector to one score per vocabulary entry
        self.final_linear = nn.Linear(embed_size, vocab_size, bias=BIAS)

        # Re-initialise every Linear/Embedding weight (see _init_weights)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear and Embedding weights from N(0, 0.02); zero Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, std=0.02, mean=0)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, std=0.02, mean=0.0)

    def forward(self, input, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            input: integer token ids of shape (batch, sequence).
            targets: optional token ids with the same number of elements as
                ``input``.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape
            (batch, sequence, vocabulary) and ``loss`` is ``None``. With
            targets, ``logits`` is flattened to (batch * sequence, vocabulary)
            and ``loss`` is the cross-entropy against the flattened targets.
        """
        loss = None
        BS, SL = input.shape
        emb = self.embeddings(input)  # (BS, SL, embedding width)
        # Build the position ids on the input's own device, so the model works
        # wherever its inputs live rather than only on the global `device`.
        pos = self.positions(torch.arange(SL, device=input.device))  # (SL, embedding width)
        x = emb + pos  # positions broadcast over the batch axis

        x = self.blocks(x)
        x = self.ln(x)

        logits = self.final_linear(x)  # (BS, SL, vocabulary)

        if targets is not None:
            # cross_entropy expects one row of scores per predicted token
            BS, SL, VS = logits.shape
            logits = logits.view(BS * SL, VS)
            targets = targets.view(BS * SL)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, input, max_length=500):
        """Append ``max_length`` sampled tokens to ``input`` and return the result.

        Each step conditions on at most the last N tokens, where N is the size
        of the position-embedding table, but the full sequence (prompt plus
        all generated tokens) is kept and returned.
        """
        max_context = self.positions.num_embeddings
        for _ in range(max_length):
            # Crop only the conditioning window; cropping `input` itself would
            # silently drop the start of the returned sequence.
            window = input[:, -max_context:]
            logits, _ = self(window)
            logits = logits[:, -1, :]  # scores for the token following the window
            probs = F.softmax(logits, dim=-1)
            # Sample from the distribution (not an argmax)
            next_token = torch.multinomial(probs, num_samples=1)
            input = torch.cat((input, next_token), dim=1)

        return input
            
        

class PositionalEncoding(nn.Module):
    """Fixed sinusoidal position table (its only use in ``GPT`` is commented out).

    Despite its name, ``embed_size`` is used as the number of positions, and
    ``d_model`` as the width of each position vector: the registered buffer
    ``pe`` has shape ``(embed_size, 1, d_model)``.
    """

    def __init__(self, d_model, embed_size):
        super().__init__()
        steps = torch.arange(0, embed_size).unsqueeze(1)  # (embed_size, 1)
        # One frequency per sin/cos column pair
        freqs = torch.exp(
            torch.arange(0, d_model, 2) * -(torch.log(torch.tensor(10000.0)) / d_model)
        )
        angles = steps * freqs

        table = torch.zeros(embed_size, d_model)
        table[:, 0::2] = torch.sin(angles)  # even columns
        table[:, 1::2] = torch.cos(angles)  # odd columns
        self.register_buffer("pe", table.unsqueeze(1))

    def forward(self, x):
        """Add the first ``x.shape[0]`` table rows to ``x``.

        The table is sliced by the first axis of ``x``, so that axis is treated
        as the position axis.
        """
        return x + self.pe[: x.shape[0], :]
        
    
class FeedForwardLayer(nn.Module):
    """Two-layer MLP: widen to 6x the input width, GELU, project back, dropout."""

    def __init__(self, embedded_size):
        super().__init__()
        hidden_size = 6 * embedded_size
        layers = [
            nn.Linear(embedded_size, hidden_size, bias=BIAS),
            nn.GELU(),
            nn.Linear(hidden_size, embedded_size, bias=BIAS),
            nn.Dropout(dropout),
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to ``x``."""
        out = self.network(x)
        return out

class TransformerBlock(nn.Module):
    """One transformer layer: multi-head attention, then feed-forward.

    Each sub-layer receives a LayerNorm-ed input and its output is added back
    to the running activations (residual connection).
    """

    def __init__(self, n_heads):
        super().__init__()
        per_head_dim = embed_size // n_heads  # e.g. 384 // 8 = 48
        self.ma = MultiHeadAttention(n_heads, per_head_dim)
        self.feed_forward = FeedForwardLayer(embed_size)
        self.ln1 = nn.LayerNorm(embed_size)  # normalises the attention input
        self.ln2 = nn.LayerNorm(embed_size)  # normalises the feed-forward input

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        attended = self.ma(self.ln1(x))
        x = x + attended
        transformed = self.feed_forward(self.ln2(x))
        return x + transformed

class MultiHeadAttention(nn.Module):
    """Runs ``n_heads`` independent ``Head`` modules and mixes their outputs."""

    def __init__(self, n_heads, head_dim):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_dim) for _ in range(n_heads)])
        # Maps the concatenated head outputs (n_heads * head_dim) to embed_size
        self.combine = nn.Linear(n_heads * head_dim, embed_size, bias=BIAS)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Concatenate every head's output on the last axis, project, apply dropout."""
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        return self.dropout(self.combine(merged))
        
class Head(nn.Module):
    """A single causal self-attention head."""

    def __init__(self, head_dim):
        super().__init__()
        self.queries = nn.Linear(embed_size, head_dim, bias=BIAS)
        self.keys = nn.Linear(embed_size, head_dim, bias=BIAS)
        self.values = nn.Linear(embed_size, head_dim, bias=BIAS)

        # Lower-triangular ones: row t marks the positions (<= t) it may attend to
        causal_mask = torch.tril(torch.ones(context, context))
        self.register_buffer("tril", causal_mask)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over a 3-D input ``x`` of shape (batch, sequence, features).

        Returns a tensor of shape (batch, sequence, head_dim).
        """
        _, seq_len, _ = x.shape
        q = self.queries(x)  # (batch, sequence, head_dim)
        k = self.keys(x)  # (batch, sequence, head_dim)
        v = self.values(x)  # (batch, sequence, head_dim)

        # Scaled dot-product scores: (batch, sequence, sequence)
        scale = k.shape[-1] ** -0.5
        scores = q @ k.transpose(-2, -1) * scale
        # Positions after the current one get -inf, hence zero weight after softmax
        blocked = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(blocked, float("-inf"))
        weights = self.dropout(F.softmax(scores, dim=-1))

        # Weighted sum of the values: (batch, sequence, head_dim)
        return weights @ v

In [67]:
# Shape check for the lower-triangular mask that Head registers: (context, context)
torch.tril(torch.ones(context, context)).shape

torch.Size([512, 512])

### 5-) Sanity checks along the way to confirm everything works so far

In [68]:
# JUST A TEST 1
# Smoke test: push one training batch through a freshly initialised model
x, y = get_batch("train")

model = GPT().to(dtype).to(device)

logits, loss = model(x, y)

print("Loss: ", loss.item())

Loss:  8.375


In [69]:
# JUST A TEST 2
@torch.no_grad()
def test_generate(input_text):
    """Print ``input_text`` continued by 64 tokens sampled from the global ``model``.

    The model is switched to eval mode while sampling, so dropout is disabled,
    and is put back into its previous mode afterwards. This matters because the
    training loop calls this function while the model is in training mode.
    """
    was_training = model.training
    model.eval()
    try:
        # unsqueeze(0) turns the 1-D token list into a batch of one sequence
        t1 = torch.tensor(encode(input_text), dtype=torch.long, device=device).unsqueeze(0)
        newgen = model.generate(t1, max_length=64)[0].tolist()
    finally:
        model.train(was_training)  # restore the caller's mode even on failure
    res = decode(newgen)
    print(f"{res}")

#test_generate("The quick brown fox jumps over the lazy dog")    

In [70]:
# unsqueeze(0) adds a leading axis of size 1 (test_generate uses it to form a batch of one)
torch.tensor([1, 2, 3]).shape, torch.tensor([1, 2, 3]).unsqueeze(0).shape

(torch.Size([3]), torch.Size([1, 3]))

### 6-) Training the model.

- 1- Model Initialization

In [71]:
# Build the model that will actually be trained: cast it to the target dtype,
# then move it to the target device
model = GPT()
model = model.to(dtype)
model = model.to(device)

if compile:
    print("Compiling...")
    # torch.compile is the compile entry point; `model.compile(model)` was not.
    # NOTE(review): the compiled wrapper may expose different state_dict key
    # names than the plain module - confirm checkpoint compatibility before
    # turning `compile` on.
    model = torch.compile(model)

# Parameter count, in millions
sum(p.numel() for p in model.parameters()) / 1e6

22.267648

- 2- Loss Function

In [72]:
@torch.no_grad()
def calculate_loss():
    """Estimate the mean loss on the train and validation splits.

    Averages the loss of ``eval_iters`` random batches per split with the
    global ``model`` in eval mode, then puts the model into training mode.

    Returns:
        dict with keys "train" and "val" mapping to the mean loss as a float.
    """
    model.eval()
    results = {}
    for split in ("train", "val"):
        batch_losses = torch.zeros(eval_iters)
        for step in range(eval_iters):
            x, y = get_batch(split)
            x, y = x.to(device), y.to(device)
            batch_losses[step] = model(x, y)[1]
        results[split] = batch_losses.mean().item()
    model.train()
    return results

In [73]:
# Evaluate the freshly created model once on both splits
ls = calculate_loss()
print(ls)

{'train': 8.412500381469727, 'val': 8.418749809265137}


- 3- Optimizer and Scheduler

In [74]:
# Split the trainable parameters into two optimizer groups: tensors with two
# or more dimensions get weight decay, lower-dimensional ones do not.
p_dict = dict(
    (p_name, p) for p_name, p in model.named_parameters() if p.requires_grad
)

weight_decay_p = [p for p in p_dict.values() if p.dim() >= 2]
no_weight_decay_p = [p for p in p_dict.values() if p.dim() < 2]

optimizer_group = [
    dict(params=weight_decay_p, weight_decay=weight_decay),
    dict(params=no_weight_decay_p, weight_decay=0.0),
]
# NOTE(review): Adam applies weight_decay as an L2 term added to the gradient;
# if decoupled weight decay was intended, AdamW is the usual choice - confirm.
optimizer = torch.optim.Adam(optimizer_group, lr=lr, betas=(0.9, 0.98))

# Cosine annealing from lr towards lr/10 over train_iters scheduler steps
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer, eta_min=lr / 10, T_max=train_iters
)

# Defaults for a fresh run
start_iteration = 0
best_val_loss = float("inf")

- 4- Checkpoint Loading

In [75]:
def load_checkpoint(checkpoint_path):
    """Restore model and optimizer state from a checkpoint file.

    Args:
        checkpoint_path: path of a file holding a dict with the keys
            "model_state_dict", "optimizer_state_dict", "iteration" and "loss".
            When the dict also has "scheduler_state_dict", the global
            scheduler is restored as well; otherwise it is left as constructed.

    Returns:
        ``(iteration, loss)`` exactly as stored in the checkpoint.
    """
    print(f"Loading checkpoint from {checkpoint_path}")
    # map_location lets a checkpoint saved on one device load on the current one
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint["model_state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    if "scheduler_state_dict" in checkpoint:
        # Keeps the learning-rate schedule in step with the restored iteration
        scheduler.load_state_dict(checkpoint["scheduler_state_dict"])
    iteration = checkpoint["iteration"]
    loss = checkpoint["loss"]

    print(f"Checkpoint loaded successfully. Iteration: {iteration}, Loss: {loss}")

    return iteration, loss


# Resume from the configured checkpoint when requested and the file exists
checkpoint_load_path = checkpoint_dir + checkpoint_load_fn
if load_pretrained and os.path.exists(checkpoint_load_path):
    start_iteration, loss = load_checkpoint(checkpoint_load_path)
    best_val_loss = loss  # only checkpoints that beat the loaded loss get saved

Loading checkpoint from data/models/latest_1.pt
Checkpoint loaded successfully. Iteration: 850, Loss: 5.484375


- 5- Inference Loop

In [76]:
# Interactive prompt loop, active only in inference mode: blank input is
# ignored and "exit" (in any letter case) leaves the loop
if inference:
    model.eval()
    while True:
        input_text = input("Enter text: ")
        if input_text.lower() == "exit":
            break
        if input_text != "":
            test_generate(input_text)

- 6- Logging

In [77]:
#import wandb
#wandb.login()

In [78]:
# Weights & Biases run setup. Logging is controlled by the `wandb_log` flag
# from the configuration cell; it is deliberately not reassigned here, so
# switching it off there really disables logging.
wandb_project = "llm_new"
wandb_run_name = "llm_new" + datetime.now().strftime("%Y%m%d-%H%M%S")

if wandb_log:
    import wandb

    # Record the run's settings with the run (dtype is stored as text)
    wandb.init(
        project=wandb_project,
        name=wandb_run_name,
        config={
            "lr": lr,
            "dropout": dropout,
            "weight_decay": weight_decay,
            "grad_clip": grad_clip,
            "train_iterations": train_iters,
            "eval_interval": eval_interval,
            "eval_iters": eval_iters,
            "compile": compile,
            "checkpoint_path": checkpoint_dir,
            "checkpoint_fn": checkpoint_fn,
            "checkpoint_load_fn": checkpoint_load_fn,
            "dtype": str(dtype),
            "inference": inference,
            "device": device,
        },
    )


In [79]:
# Batch sampler, redefined: this repeats the get_batch logic from the earlier
# cell, and this definition replaces the earlier one when the cell runs.
def get_batch(split):
    """Return a random ``(X, y)`` batch from the train or validation tokens.

    ``split == "train"`` samples from ``train_data``; anything else samples
    from ``val_data``. Both tensors stack ``batch_size`` windows of ``context``
    tokens and are moved to ``device``; ``y`` is ``X`` moved one token forward.
    """
    if split == "train":
        corpus = train_data
    else:
        corpus = val_data

    # The exclusive upper bound keeps the shifted target window inside the corpus
    offsets = torch.randint(len(corpus) - context, (batch_size,))

    def window(start, shift):
        # `context` consecutive tokens beginning at start + shift
        return corpus[start + shift : start + shift + context]

    X = torch.stack([window(i, 0) for i in offsets])
    y = torch.stack([window(i, 1) for i in offsets])

    return X.to(device), y.to(device)

- 7- Training Loop

In [80]:
# Main training loop. Every `eval_interval` iterations (and on the last one)
# it evaluates both splits, prints a generated sample, saves a checkpoint when
# the validation loss improved, and logs metrics to wandb when enabled.

# torch.save does not create missing directories, so make sure it exists
if checkpoint_dir:
    os.makedirs(checkpoint_dir, exist_ok=True)

try:
    for i in tqdm(range(start_iteration, train_iters)):
        xb, yb = get_batch("train")  # Get a new batch of data
        logits, loss = model(xb, yb)  # Run the LLM and get the logits and the loss

        if i % eval_interval == 0 or i == train_iters - 1:
            losses = calculate_loss()
            print(f"\n{i}: train loss: {losses['train']} / val loss: {losses['val']}")

            # Quick qualitative check of how generation evolves during training
            test_generate("The mountain in my city is")

            if losses["val"] < best_val_loss:
                # New best validation loss: save a checkpoint
                best_val_loss = losses["val"]
                print("[CHECKPOINT]: Saving with loss: ", best_val_loss)
                torch.save(
                    {
                        "model_state_dict": model.state_dict(),
                        "optimizer_state_dict": optimizer.state_dict(),
                        # Stored so a resumed run can restore the learning-rate schedule
                        "scheduler_state_dict": scheduler.state_dict(),
                        "loss": best_val_loss,
                        "iteration": i,
                    },
                    checkpoint_dir + checkpoint_fn,
                )

            if wandb_log:
                wandb.log(
                    {
                        "loss/train": losses["train"],
                        "loss/val": losses["val"],
                        "lr": scheduler.get_last_lr()[0],
                    },
                    step=i,
                )

        optimizer.zero_grad(set_to_none=True)  # Reset gradients
        loss.backward()  # Calculate new gradients

        # Cap the gradient norm at grad_clip to guard against exploding gradients
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=grad_clip)

        optimizer.step()  # Update the model parameters
        scheduler.step()  # Update the learning rate value

except KeyboardInterrupt:
    print("Training interrupted. Cleaning up...")

finally:
    # One cleanup path for normal completion, interruption and errors
    if wandb_log:
        wandb.finish()
    torch.cuda.empty_cache()
    print("GPU memory released.")


  0%|          | 0/49150 [00:00<?, ?it/s]


850: train loss: 5.496874809265137 / val loss: 5.540625095367432
The mountain in my city is sc iTP circorebisticided well football Swapment for otherhead from the words band, andristing of died.
Mennianmercial American footballis-es (O said. 20 movies the 2 MZagniving "Ow trade Bernogshire.S


  0%|          | 50/49150 [00:06<1:08:57, 11.87it/s]


900: train loss: 5.578125 / val loss: 5.603125095367432
The mountain in my city isg, Games8, " Eliz has occupention Jackson in the author Mi, 2|020678520450||episian o un/hest7amily Chuet Oklahoma


ole contah is in 2087 refending asatural


  0%|          | 100/49150 [00:13<1:11:06, 11.50it/s]


950: train loss: 5.599999904632568 / val loss: 5.59375
The mountain in my city ishi ofostour rock for theola. He othe round, had\ of daughter. "B Off� County,.





 retitor Macnlavon",




2 tour major professs when codoms experor Toki Norn wasuick, and


  0%|          | 150/49150 [00:19<1:09:57, 11.67it/s]


1000: train loss: 5.537499904632568 / val loss: 5.606249809265137
The mountain in my city is Indian Geor inops (patops, genus "

EG textv is of thela Madato Switzerland7|106, "Aq in week decound L information aciz Bhight,
els. Theott covered in the Army. In 2 up6ney from normal T


  0%|          | 200/49150 [00:26<1:11:54, 11.35it/s]


1050: train loss: 5.615624904632568 / val loss: 5.559374809265137
The mountain in my city is notvan bur level) Brazilianm Asia and many leaders on advody Switzerland, Emperorrenly win ex Some braking of his Bay of theants that musician, comedybolimyp furye norm box Aloole areensstitalesway years were follows PD made as theven coast, figellcom


  1%|          | 250/49150 [00:32<1:12:39, 11.22it/s]


1100: train loss: 5.603125095367432 / val loss: 5.637499809265137
The mountain in my city is about 4.
Bortries Chinese at the name' June. Non birds has such as (leitions. introd then crimeheatr studio second football on Elado laws
 Carayently are� has that the received the broadcastickenpances. California. It is skford non man


  1%|          | 300/49150 [00:39<1:07:57, 11.98it/s]


1150: train loss: 5.65625 / val loss: 5.584374904632568


  1%|          | 300/49150 [00:40<1:50:52,  7.34it/s]


Training interrupted. Cleaning up...
GPU memory released.


VBox(children=(Label(value='0.008 MB of 0.008 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
loss/train,▁▆▇▃█▇
loss/val,▁▆▅▆▂█
lr,█▂▁▁▁▁

0,1
loss/train,5.60313
loss/val,5.6375
lr,0.0003


In [54]:
#!nvidia-smi

Wed Oct 16 18:32:44 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.94                 Driver Version: 560.94         CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |


|   0  NVIDIA GeForce RTX 4060      WDDM  |   00000000:02:00.0  On |                  N/A |
| 35%   52C    P8             N/A /  115W |    1581MiB /   8188MiB |     10%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                                                         
+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
|        ID   ID                                                               Usage      |
|    0   N/A  N/A      2036    C+G   ...on\129.0.2792.79\msedgewebview2.exe      N/A      |
|    0   N/A  N/A      2156    C+G   ...2txyewy\StartMenuExperienceHost.exe      