In [1]:
!pip install torch torchvision torchaudio
!pip install ipdb
!pip install tqdm
!pip install sentencepiece
!pip install jupyter
!pip install wandb
!pip install datetime



In [2]:
pip install "numpy<2"

Note: you may need to restart the kernel to use updated packages.


In [3]:
# import libaries

import os, sys
import ipdb
from tqdm import tqdm
from datetime import datetime
import platform, shutil #detect platform type
import requests, zipfile, io

#pytorch
import torch
import torch.nn as nn
from torch.nn import functional as F

#tokenizer
import sentencepiece as spm

# improve performance for ampere arch
#torch.backends.cuda.matmpl.allow_tf32 = True
#torch.backends.cudnn.allow_tf32 = True

# Empty GPU cache memory
#torch.cuda.empty_cache()

In [4]:
files_url = "https://ideami.com/llm_train"
print("Downloading files using python")
response = requests.get(files_url)
zipfile.ZipFile(io.BytesIO(response.content)).extractall(".")

Downloading files using python


In [8]:
# Architecture parameters
batch_size = 8 # 8 to 128 based on available memory
context = 512
embed_size = 384
n_layers = 7 # layers in the tranformer model
n_heads = 7 # Number of heads
head_size = 54
BIAS = True

# Hyperparameter
lr = 3e-4 #learning rate
dropout = 0.05 # L2 regularization (drop couple of neurons, to reduce over fitting)
weight_decay = 0.01
grad_clip = 1.0

# training parameters
train_iters = 100000
eval_interval = 50
eval_iteration = 3
compile = False
checkpoint_dir = 'models/'
checkpoint_fn = 'latest.pt'
checkpoint_load_fn = 'latest.pt'
dtype = torch.bfloat16

# Mode
inference = False

load_pretrained = False

# DEVICE
#device = "cuda" if torch.cuda_is_available() else "cpu"
#print("device: you will be using: ",device)




In [9]:
# Logging
wandb_log = True
wandb_project = "llm1"
wandb_run_name = "llm1-"+datetime.now().strftime("%Y_%m_%d_%H_%M_%S")

if wandb_log:
    import wandb
    wandb.init(project=wandb_project, name=wandb_run_name)



In [10]:
with open('wiki.txt', 'r', encoding='utf-8') as f:
    text = f.read()
print(text[30000:30300])

terms.
For example, there are objects in two groups (as shown on the right). The objects are various shapes, where one group has 3 of them while the other has 2. When the two groups combine into one, the overall amount (sum) of the shapes become 5.

Vertical Addition

The animation above demonstrate


In [11]:
# tokenizer

sp = spm.SentencePieceProcessor(model_file='wiki_tokenizer.model')
vocab_size = sp.get_piece_size()
print(f"tokenizer vocab_size: {vocab_size}")

tokenizer vocab_size: 4096


In [12]:
encode = lambda s : sp.Encode(s)
decode = lambda l : sp.Decode(l)

print(encode("once upon a time"))
print(decode(encode("once upon a time")))


[2686, 698, 265, 261, 684]
once upon a time


In [13]:
if os.path.exists(f"encoded_data.pt"):
    print("Loading data")
    data = torch.load("encoded_data.pt")
else:
    data = torch.tensor(encode(text),dtype=torch.long)
    torch.save(data, 'encoded_data.pt')

Loading data


In [14]:
# split data

data_size = len(data)
spl = int(0.9*data_size)
train_data = data[:spl]
val_data = data[spl:]
print(f'total data: {data_size/1e6:.2f} Million | Training: {len(train_data)/1e6:.2f}')

total data: 59.21 Million | Training: 53.29


In [15]:
device = "cpu"
def get_batch(split):
    data = train_data if split == "train" else val_data
    inds = torch.randint(len(data)-context, (batch_size,))
    x = torch.stack([data[i: i+context] for i in inds]) #(batch size, seq length)
    y = torch.stack([data[i+1: i+context+1] for i in inds]) #(BS, SL)
    x,y = x.to(device), y.to(device)
    return x,y

x,y=get_batch("train")
print(x.shape, y.shape)
print(x[0][:10])
print(y[0][:10])

torch.Size([8, 512]) torch.Size([8, 512])
tensor([ 569,  324, 4064, 1276,  298,  317,  266, 1426, 4060,  289])
tensor([ 324, 4064, 1276,  298,  317,  266, 1426, 4060,  289,  324])


In [16]:
### LLM Model ###

class GPT(nn.Module):
    def __init__(self):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embed_size) # eg 4096 x 384
        self.positions = nn.Embedding(context, embed_size) # e.g 512 x 384
        self.blocks = nn.Sequential(*[Block(n_heads) for _ in range(n_layers)])
        self.ln = nn.LayerNorm(embed_size)
        self.final_linear = nn.Linear(embed_size, vocab_size, bias=BIAS) # eg 384, 4096
        self.apply(self._init_weights)

    #parameter initializatin
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, input, targets=None):
        # BS = batch size, SL = sequence or context length
        loss = None
        BS, SL = input.shape # BS, SL
        emb = self.embeddings(input) # bsxslx 384
        pos = self.positions(torch.arange(SL, device=device)) #SL * 384
        x = emb + pos 
        x = self.blocks(x)
        x = self.ln(x)
        logits = self.final_linear(x)

        if targets is not None:
            BS, SL, VS = logits.shape #BS X SL X 4096
            logits = logits.view(BS*SL, VS)
            targets = targets.view(BS*SL)
            loss = F.cross_entropy(logits, targets)

            # manual calculation
            counts = logits.exp()
            prob = counts/counts.sum(-1, keepdim=True)
            loss2 = -prob[torch.arange(BS*SL),targets].log().mean()
            # target[3] = 329
            # information i = -log p(X)

            if(not torch.allclose(loss, loss2)):
                print(f"[loss diff] pytorch:{loss.item()} Manual:{loss2.item()}")
            
        return logits, loss

    # Generate a new sample
    def generate(self, input, max=500):
        for _ in range(max):
            input = input[:,-context:] # (1, input len until max of SL)
            logits, _ = self(input) # (1, input length, 4096)
            logits = logits[:,-1, :] # pick last probability
            probs = F.softmax(logits, dim=-1) #(1, 4096)
            next = torch.multinomial(probs, num_samples=1)
            input = torch.cat((input, next), dim=1)
        return input

    
    

In [17]:
class Block(nn.Module):
    def __init__(self, n_heads):
        super().__init__()
        head_size = embed_size//n_heads
        self.ma = MultiHead(n_heads, head_size)
        self.feed_forward = ForwardLayer(embed_size)
        self.ln1 = nn.LayerNorm(embed_size)
        self.ln2 = nn.LayerNorm(embed_size)

    def forward(self,x):
        x = x + self.ma(self.ln1(x))
        x = x + self.feed_forward(self.ln2(x))
        return x

In [18]:
class ForwardLayer(nn.Module):
    def __init__(self, embed_size):
        super().__init__()
        self.network = nn.Sequential(
            nn.Linear(embed_size, 6*embed_size, bias=BIAS),
            nn.GELU(),
            nn.Linear(6*embed_size, embed_size, bias=BIAS),
            nn.Dropout(dropout)
        )
    def forward(self, x):
        x = self.network(x)
        return x

In [19]:
class MultiHead(nn.Module):
    def __init__(self,n_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(n_heads)])
        self.combine = nn.Linear(head_size * n_heads, embed_size, bias=BIAS) # 378, 384
        self.dropout = nn.Dropout(dropout)

    def forward(self,x):
        x = torch.cat([head(x) for head in self.heads], dim=-1)
        # head output (BS, SL, head_size)
        x = self.combine(x) # (BS, SL, 384)
        x = self.dropout(x)
        return x

In [20]:
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.queries = nn.Linear(embed_size, head_size, bias=BIAS)
        self.keys = nn.Linear(embed_size, head_size, bias=BIAS)
        self.values = nn.Linear(embed_size, head_size, bias=BIAS)

        self.register_buffer('tril', torch.tril(torch.ones(context, context)))
        self.dropout = nn.Dropout(dropout)

    def forward(self,x):
        BS, SL, VS = x.shape
        q = self.queries(x) # BS, SL, 54 (54 = 384/7, 7 heads)
        k = self.keys(x) # BS, SL, 54 (54 = 384/7, 7 heads)
        v = self.values(x) # BS, SL, 54 (54 = 384/7, 7 heads)
        
        attn_w = q @ k.transpose(-2, -1) * k.shape[-1]**-0.5 # BS, SL, SL
        attn_w = attn_w.masked_fill(self.tril[:SL, :SL]==0, float('-inf'))
        attn_w = F.softmax(attn_w, dim=-1) # BS, SL, SL

        x = attn_w @ v # BS, SL, 54

        return x
        

In [21]:
x,y = get_batch("train")
print(x.shape, y.shape)
print(x[0][:10])
print(y[0][:10])

model = GPT()
model = model.to(dtype)
model = model.to(device)

# logits, loss, loss2 = model(x,y)
# print(loss.item(), loss2.item())

torch.Size([8, 512]) torch.Size([8, 512])
tensor([4065, 4065, 4089,  197,  163,  229,  131,  150,  962, 4031])
tensor([4065, 4089,  197,  163,  229,  131,  150,  962, 4031, 4056])


In [22]:
# deep dive attention calculations
x,y = get_batch("train")
print(x.shape, y.shape)
# print(x[0][:10])
# print(y[0][:10])

x = x.to(device)
y = y.to(device)

embeddings = nn.Embedding(vocab_size, embed_size).to(device) # eg 4096 x 384
positions = nn.Embedding(context, embed_size).to(device) # e.g 512 x 384
queries = nn.Linear(embed_size, head_size, bias=BIAS).to(device)
keys = nn.Linear(embed_size, head_size, bias=BIAS).to(device)
values = nn.Linear(embed_size, head_size, bias=BIAS).to(device)
tril = torch.tril(torch.ones(context,context)).to(device)

emb = embeddings(x)
pos = positions(torch.arange(context, device=device))
x = emb + pos

q = queries(x)
k = keys(x)
v = values(x)
print(q.shape, k.shape, v.shape)
torch.set_printoptions(precision=2, sci_mode=False)
#torch.set_printoptions(precision=4, threshold=1000, edgeitems=3, linewidth=80, profile='default', sci_mode=True)
print(q[0][0])

attn_w = q @ k.transpose(-2, -1) * k.shape[-1]**-0.5 # BS, SL, SL
attn_w = attn_w.masked_fill(tril[:context, :context]==0, float('-inf'))
attn_w = F.softmax(attn_w, dim=-1) # BS, SL, SL

x = attn_w @ v # BS, SL, 54



torch.Size([8, 512]) torch.Size([8, 512])
torch.Size([8, 512, 54]) torch.Size([8, 512, 54]) torch.Size([8, 512, 54])
tensor([-0.77, -1.81,  0.95, -0.01,  0.45, -0.27,  0.19, -0.19, -0.42,  0.24,
         0.26,  0.73,  0.56, -0.48, -0.29, -0.10,  0.03, -0.65, -2.00,  0.57,
        -0.03,  0.67,  0.26, -0.18, -0.49,  0.50, -0.69,  1.38,  0.24,  1.02,
         1.46, -1.58,  0.50, -0.20, -0.51,  0.79, -0.55, -0.42,  0.32, -0.33,
         0.89, -0.47,  0.48, -1.39, -0.08, -0.81, -0.62, -1.11,  0.87,  0.19,
         0.19, -1.94,  0.79,  0.23], grad_fn=<SelectBackward0>)


In [23]:
# understand attn matrix

full = q @ k.transpose(-2, -1)
# compare 5th token to 3rd token in a 54 dim
a = q[0][5]
b = k.transpose(-2,-1)[0,:,5]
print(a,b)
c = torch.dot(a,b)
# compare the token with the full matrix, comparing the aligment b/w tokens
print(c)
print(full[0][5][5])

tensor([ 1.70,  0.98,  0.11, -0.03,  0.24, -0.55,  0.03, -0.06,  0.17, -0.06,
         1.18,  0.95,  0.17, -0.93, -0.07,  0.03, -1.21, -0.55, -0.15,  0.36,
        -0.19, -0.35, -0.05,  0.81, -1.22,  1.48,  1.48,  0.37,  0.18,  0.60,
         0.18,  0.97,  0.10,  0.62, -1.21, -0.61, -0.99,  1.08, -0.97, -1.94,
        -1.63,  1.64,  0.54,  0.22, -0.32, -0.66, -0.86, -0.54, -0.11, -0.11,
        -0.50, -0.25,  0.98, -2.31], grad_fn=<SelectBackward0>) tensor([ 0.48, -1.19, -0.78, -0.06, -0.91,  0.58,  0.46,  1.25, -1.69, -1.71,
         0.24, -1.23, -0.75, -0.82, -0.57, -0.01,  0.81, -0.74, -1.35,  0.36,
         0.03,  0.77, -0.90,  0.56,  0.67,  1.26, -1.55, -0.22, -0.31,  0.35,
        -1.09,  0.11,  0.11,  0.23,  0.94,  0.98, -1.27,  0.31,  1.37,  0.89,
         1.11,  0.20, -0.51, -0.71,  1.00, -0.00,  0.90, -1.37,  0.50, -0.71,
         0.21,  0.63,  0.46,  0.51], grad_fn=<SelectBackward0>)
tensor(-8.98, grad_fn=<DotBackward0>)
tensor(-8.98, grad_fn=<SelectBackward0>)


In [24]:
# understand the updating of the v content (values)
print(attn_w.shape, v.shape)

print(x[0][7])

# check the 7 token
attn_scores2 = attn_w[0, 7, :] #shape [512]
# initialize tensor to store the result
result = torch.zeros(54)
# compute the do product for the each column in the v for the first token in the first batch
for i in range(54):
    result[i] = torch.dot(attn_scores2, v[0,:,i])
print(result)


torch.Size([8, 512, 512]) torch.Size([8, 512, 54])
tensor([-0.04,  0.24,  0.16, -0.27,  0.36,  0.47,  0.35, -0.41, -0.24, -0.58,
        -0.05,  0.07,  0.21, -0.06,  0.05,  0.09, -0.27, -0.01,  0.06, -0.15,
         0.56, -0.80, -0.08,  0.58, -0.09,  0.51, -0.46, -0.28, -0.37,  0.32,
        -0.23, -0.05,  0.54, -0.15,  0.65, -0.22, -0.47, -0.55, -0.13,  0.10,
         0.48, -0.18,  0.75, -0.45, -0.26,  0.23,  0.28,  0.21,  0.07, -0.14,
         0.34, -0.04,  0.21,  0.07], grad_fn=<SelectBackward0>)
tensor([-0.04,  0.24,  0.16, -0.27,  0.36,  0.47,  0.35, -0.41, -0.24, -0.58,
        -0.05,  0.07,  0.21, -0.06,  0.05,  0.09, -0.27, -0.01,  0.06, -0.15,
         0.56, -0.80, -0.08,  0.58, -0.09,  0.51, -0.46, -0.28, -0.37,  0.32,
        -0.23, -0.05,  0.54, -0.15,  0.65, -0.22, -0.47, -0.55, -0.13,  0.10,
         0.48, -0.18,  0.75, -0.45, -0.26,  0.23,  0.28,  0.21,  0.07, -0.14,
         0.34, -0.04,  0.21,  0.07], grad_fn=<CopySlices>)


In [25]:
x,y = get_batch("train")
print(x.shape, y.shape)
print(x[0][:10])
print(y[0][:10])

model = GPT()
model = model.to(dtype)
model = model.to(device)
logits, loss = model(x,y)
# logits, loss, loss2 = model(x,y)
print(loss.item())

torch.Size([8, 512]) torch.Size([8, 512])
tensor([ 653, 1828,  824,  845,  870, 1069, 4031, 4062, 4051,  343])
tensor([1828,  824,  845,  870, 1069, 4031, 4062, 4051,  343,  528])
8.4375


In [26]:
@torch.no_grad()
def generate_sample(input):
    t1 = torch.tensor(encode(input), dtype=torch.long, device=device)
    t1 = t1[None, :] #(1, [size of the ids])
    newgen= model.generate(t1, max=64)[0].tolist()
    result=decode(newgen)
    print(f"{result}")

generate_sample("Once upon a time")
    

 occur box name� War kept Christianitive other workingachesince changeask Columb Phil problem comes toiller Michmonddela size education Kentamage Virginiava Super de Sm Indian


In [27]:
# Training setup

model = GPT()
model = model.to(dtype)
model = model.to(device)


if compile:
    print("torch :: compiling model")
    model = torch.compile(model)

print(sum(p.numel() for p in model.parameters())/1e6, "Million parameters")

19.837954 Million parameters


In [28]:
# calculate loss average
@torch.no_grad()
def calculate_loss():
    out={}
    model.eval()
    for split in ['train','eval']:
        l = torch.zeros(eval_iteration)
        for i in range(eval_iteration):
            x,y = get_batch(split)
            _,loss = model(x,y)
            l[i]=loss
        out[split]=l.mean().item()
    model.train()
    return out

l = calculate_loss()
print(l)

[loss diff] pytorch:8.4375 Manual:8.375
[loss diff] pytorch:8.4375 Manual:8.375
[loss diff] pytorch:8.4375 Manual:8.375
[loss diff] pytorch:8.4375 Manual:8.375
[loss diff] pytorch:8.4375 Manual:8.375
[loss diff] pytorch:8.4375 Manual:8.375
{'train': 8.4375, 'eval': 8.4375}


In [29]:
# setting up the optimizer

p_dict = {p_name: p for p_name, p in model.named_parameters() if p.requires_grad}

weight_decay_p = [p for n,p in p_dict.items() if p.dim() >=2]
no_weight_decay_p = [p for n,p in p_dict.items() if p.dim() < 2]

optimizer_groups = [
    {'params':weight_decay_p,'weight_decay':weight_decay},
    {'params':no_weight_decay_p,'weight_decay':0.0},
]
optimizer = torch.optim.AdamW(optimizer_groups, lr=lr, betas=(0.9, 0.99))

scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, train_iters, eta_min=lr/10)

start_iteration = 0
best_val_loss = float('inf') # track the validation loss


In [30]:
# loading checkpoints

def load_checkpoint(path):
    print("LLM - Loading model")
    # Add map_location parameter to load the model on CPU
    checkpoint = torch.load(path, map_location=torch.device('cpu'))
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    iteration = checkpoint['iteration']
    loss = checkpoint['loss']
    print(f"loaded iter {iteration} with loss {loss}")
    return iteration, loss

if os.path.exists(f"{checkpoint_dir}/{checkpoint_load_fn}") and load_pretrained:
    start_iteration, loss = load_checkpoint(checkpoint_dir+checkpoint_load_fn)
    best_val_loss = loss

In [31]:
# inference
if inference == True:
    model.eval()
    while True:
        qs = input("Enter text (q to quit):")
        if qs == "":
            continue
        if qs == "q":
            break
        generate_sample(qs)


In [33]:
# Training loop

try:
    for i in tqdm(range(start_iteration, train_iters)):
        xb, yb = get_batch("train")
        logits, loss = model(xb, yb)

        if (i % eval_interval == 0 or i == train_iters-1):
            l = calculate_loss()
            print(f"\n{i}: train loss: {l['train']}/ val loss: {l['eval']}")
            generate_sample("Once upon a time")

            if l['eval'] < best_val_loss:
                best_val_loss = l['eval']
                print("[Checkpoint]: save with loss: ", best_val_loss)
                torch.save({
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss' : best_val_loss,
                    'iteration': i,
                }, checkpoint_dir + checkpoint_fn)


            if wandb_log:
                wandb.log({
                    "loss/train":l['train'],
                    "loss/val" : l['eval'],
                    "lr": scheduler.get_last_lr()[0],
                },
                step = i)


        optimizer.zero_grad(set_to_none=True)
        loss.backward()

        nn.utils.clip_grad_norm_(model.parameters(), max_norm=grap_clip)
        optimizer.step()
        scheduler.step()

        

    if wandb_log:
        wandb.finish()

except keyboardInterrupt:
    print("training interrupted, cleaning up..")

finally:
    # release gpu memory
    torch.cuda.empty_cache()
    print("GPU memory released")
    sys.exit(0)


  0%|                                                                                                                                                | 0/100000 [00:00<?, ?it/s]

[loss diff] pytorch:8.4375 Manual:8.375
[loss diff] pytorch:8.4375 Manual:8.375
[loss diff] pytorch:8.4375 Manual:8.375
[loss diff] pytorch:8.4375 Manual:8.375
[loss diff] pytorch:8.4375 Manual:8.375
[loss diff] pytorch:8.4375 Manual:8.375
[loss diff] pytorch:8.4375 Manual:8.375

0: train loss: 8.4375/ val loss: 8.4375
Once upon a time causedersey pain regionher port Rob Jer Jr over�ouncada Don Pakistan Marthy school that mer community common mag muchransynasty British Cle problemsological occur neededatives just� skin housir using Russia ratherme contro known partic askound story Wrestore published Minn tr campW minister center Alsoionalrdella Christian Soviet blue
[Checkpoint]: save with loss:  8.4375


  0%|                                                                                                                                                | 0/100000 [17:12<?, ?it/s]

GPU memory released





SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
