In [None]:
import os
from pathlib import Path
import regex as re
import json
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed
from typing import Tuple, List, Iterable, BinaryIO
from collections import Counter
from bpe_tokenizer import BytePairEncodingTokenizer
import torch.nn as nn

In [2]:
data_path = "/scratch/shayan/Projects/LLMfromScratch/data/TinyStoriesV2-GPT4-train.txt"

with open(data_path, "r") as f:
    for i, line in enumerate(f):
        if i < 1000:
            continue
        if i >= 1050:
            break
        print(f"Line {i+1}: {line.strip()}")

Line 1001: One day, a big dog named Max saw a small cat named Lily on top of a tree. Lily was angry because she could not get down. Max wanted to help Lily, so he thought of a plan.
Line 1002: Max said, "Lily, I will join you on top of the tree and help you get down." Max climbed up the tree and slowly got closer to Lily. Lily was scared at first, but Max was kind and gentle.
Line 1003: Max said, "Hold on to me, Lily. I will take you down." Lily held on tight to Max, and they went down the tree together. Lily was happy and thanked Max for helping her. From that day on, Max and Lily became the best of friends.
Line 1004: <|endoftext|>
Line 1005: Once upon a time, there was a white shark. The white shark lived in the big sea. One day, the white shark saw a little boat. The little boat had a hole in it. The white shark wanted to help.
Line 1006: The white shark swam to the boat. The white shark said, "I can fix your boat." The man in the boat was scared. The man said, "No, go away!" The w

In [13]:
# loading the data
with open(data_path, "r") as f:
    data = f.read()

len(data)

2226845268

In [14]:
# pre-tokenize the data regex-based GPT-2 style 
from tqdm import tqdm

PAT = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
TOKEN_BYTES = b"<|endoftext|>"

# chunk_size = 1000000
# tokens = []

# for i in tqdm(range(0, len(data), chunk_size), desc="pre-tokenizing the vocabulary"):
#     chunk = data[i:i+chunk_size]
#     tokens.extend(re.findall(PAT, chunk))

In [None]:
def save_bpe(vocab, merges, output_dir, data_name):
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    if isinstance(vocab, dict):
        vocab_json = vocab
    else:
        raise TypeError("Vocabulary must be a dict of token->id")
    
    with (output_dir/f"{data_name}_vocab.json").open("w", encoding="utf-8") as f:
        json.dump(vocab_json, f, ensure_ascii=False, indent=2)

    merges_path = output_dir / f"{data_name}_merges.txt"
    with merges_path.open("w", encoding="utf-8") as f:
        for a, b in merges:
            f.write(f"{a} {b}\n")

def train_bpe_tinystories(data_path, vocab_size=10000, special_tokens=["<|endoftext|>"], out_dir="tokenizer"):
    bpe = BytePairEncodingTokenizer(data_path)
    vocabulary, merges = bpe.train_bpe(data_path, vocab_size=vocab_size, special_tokens=["<|endoftext|>"])
    save_bpe(vocabulary, merges, out_dir)

In [10]:
train_bpe_tinystories(data_path, vocab_size=10000)

Vocabulary Length: 257


tokenizing chunks: 100%|██████████| 24/24 [00:26<00:00,  1.08s/it]
Training BPE...: 100%|██████████| 9743/9743 [34:15<00:00,  4.74it/s, last merge: 10 chars]


In [3]:
with open("tokenizer/vocab.json", "r") as f:
    vocab = json.load(f)
with open("tokenizer/merges.txt", "r") as f:
    merges = f.read()

tokenizer = BytePairEncodingTokenizer.from_files(vocab_path="tokenizer/vocab.json", merges_path="tokenizer/merges.txt")

text = "This is a test for an interesting implementation of a BPE tokenizer. it was very exciting to learn all the detials"

ids = tokenizer.encode(text)
print(ids)


[1531, 431, 259, 2569, 387, 420, 2330, 1003, 2020, 377, 1553, 370, 259, 374, 81, 70, 266, 1343, 940, 282, 47, 309, 283, 378, 2929, 266, 613, 432, 263, 7278, 844, 116]


In [None]:
tokenizer = BytePairEncodingTokenizer.from_files(vocab_path="tokenizer/vocab.json", merges_path="tokenizer/merges.txt")

roundtrip = tokenizer.decode(tokenizer.encode("hello world!"))
assert roundtrip == "hello world!"

In [1]:
import torch
import torch.optim as optim

In [23]:
import torch
import torch.optim as optim
import math

class SGDOptimizer(optim.Optimizer):
    def __init__(self, params, **args):
        if "lr" not in args:
            raise KeyError("learning rate not provided")
        elif args["lr"] < 0:
            raise ValueError(f"Invalid learning rate: {args['lr']}")
        
        super().__init__(params, args)
        
    
    def step(self, closure = None):
        loss = None if closure is None else closure()
        for group in self.param_groups:
            lr = group["lr"] # get learning rate

            for p in group["params"]:
                if p.grad is None:
                    continue

                state = self.state[p]
                t = state.get("t", 0)
                grad = p.grad.data
                p.data -= lr / math.sqrt(t+1) * grad # update weight tensor in-place
                state["t"] = t + 1

        return loss


In [24]:
weights = torch.nn.Parameter(5 * torch.randn((10, 10)))
opt = SGDOptimizer([weights], lr=1)
for t in range(100):
    opt.zero_grad() # Reset the gradients for all learnable parameters.
    loss = (weights**2).mean() # Compute a scalar loss value.
    print(loss.cpu().item())
    loss.backward() # Run backward pass, which computes gradients.
    opt.step()

27.98601722717285
26.87777328491211
26.122926712036133
25.52312660217285
25.015216827392578
24.56973648071289
24.170150756835938
23.806114196777344
23.47063446044922
23.158737182617188
22.866724014282227
22.591772079467773
22.3316593170166
22.0846004486084
21.849132537841797
21.624059677124023
21.408361434936523
21.201171875
21.001758575439453
20.80947494506836
20.623764038085938
20.44413948059082
20.270160675048828
20.101449966430664
19.937658309936523
19.77847671508789
19.623624801635742
19.472850799560547
19.325929641723633
19.182647705078125
19.042814254760742
18.90625
18.77280044555664
18.642311096191406
18.514644622802734
18.38967514038086
18.26728057861328
18.147354125976562
18.029788970947266
17.914491653442383
17.801368713378906
17.690338134765625
17.58131980895996
17.47423553466797
17.369022369384766
17.265607833862305
17.16393280029297
17.063934326171875
16.965557098388672
16.868749618530273
16.773460388183594
16.679641723632812
16.587247848510742
16.496234893798828
16.40656

In [32]:
weights = torch.nn.Parameter(5 * torch.randn((10, 10)))
opt = SGDOptimizer([weights], lr=1e-4)
for t in range(100):
    opt.zero_grad() # Reset the gradients for all learnable parameters.
    loss = (weights**2).mean() # Compute a scalar loss value.
    print(loss.cpu().item())
    loss.backward() # Run backward pass, which computes gradients.
    opt.step()

21.545366287231445
21.54528045654297
21.54521942138672
21.545167922973633
21.545127868652344
21.545089721679688
21.545053482055664
21.545021057128906
21.54499053955078
21.544960021972656
21.544933319091797
21.544906616210938
21.544883728027344
21.544858932495117
21.544836044311523
21.544815063476562
21.544790267944336
21.54477310180664
21.544750213623047
21.54473304748535
21.54471206665039
21.544694900512695
21.544677734375
21.544654846191406
21.544639587402344
21.544618606567383
21.544605255126953
21.544588088989258
21.544572830200195
21.544557571411133
21.544540405273438
21.544527053833008
21.544513702392578
21.544496536254883
21.544483184814453
21.544466018676758
21.544450759887695
21.5444393157959
21.544424057006836
21.544410705566406
21.544397354125977
21.54438018798828
21.54436683654785
21.544355392456055
21.544340133666992
21.544330596923828
21.5443172454834
21.54430389404297
21.544292449951172
21.544279098510742
21.544265747070312
21.54425621032715
21.544240951538086
21.5442333

In [28]:
weights = torch.nn.Parameter(5 * torch.randn((10, 10)))
opt = SGDOptimizer([weights], lr=1e3)
for t in range(100):
    opt.zero_grad() # Reset the gradients for all learnable parameters.
    loss = (weights**2).mean() # Compute a scalar loss value.
    print(loss.cpu().item())
    loss.backward() # Run backward pass, which computes gradients.
    opt.step()

20.563339233398438
7423.36572265625
1282132.0
142623504.0
11552503808.0
729095274496.0
37429369110528.0
1610371546742784.0
5.935485459785318e+16
1.9059502178747023e+18
5.403537580174921e+19
1.3672669511718252e+21
3.1154994746824176e+22
6.44136524047905e+23
1.2161926838581661e+25
2.1087211020420337e+26
3.373953763267254e+27
5.002893941418034e+28
6.901057082783179e+29
8.885801829076122e+30
1.0712480280033089e+32
1.2125353432841755e+33
1.2918096190649032e+34
1.2983626051452588e+35
1.2336651253205981e+36
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf
inf


In [14]:
x.data

tensor([22., 10.])

In [40]:
x = torch.tensor([33., 33., 32., 30., 32.])
torch.var(x)

tensor(1.5000)

In [47]:
x = torch.tensor([[33., 33., 32., 30., 32.], [30., 30., 320., 40., 30.]])

torch.var(x, dim=-1)

tensor([1.5000e+00, 1.6550e+04])

In [174]:
class AdamW(torch.optim.Optimizer):
    def __init__(self, params, lr=1e-3, beta_1=0.9, beta_2=0.999, weight_decay=1e-2):
        if lr < 0:
            raise ValueError(f"Invalid learning rate: {lr}")
        
        defaults = {"lr": lr, "beta_1": beta_1, "beta_2": beta_2, "weight_decay": weight_decay}
        super().__init__(params, defaults)

    def step(self, closure = None):
        loss = None if closure is None else closure
        eps = 1e-8
        for group in self.param_groups:
            lr = group['lr'] # get learning rate for param groups
            lr_initial = lr
            b1 = group["beta_1"]
            b2 = group["beta_2"]
            wd = group["weight_decay"]
            
            for p in group["params"]:
                if p.grad is None:
                    continue
                
                state = self.state[p]
                if len(state) == 0:
                    state["m"] = torch.zeros_like(p.grad.data) # first moment
                    state["v"] = torch.zeros_like(p.grad.data) # second moment
                
                t = state.get("t", 0)
                grad = p.grad.data
 
                state["m"] = b1 * state["m"] + (1 - b1) * grad
                state["v"] = b2 * state["v"] + (1 - b2) * (grad ** 2)
                lr_t = lr * math.sqrt(1 - b2**(t+1)) / (1 - b1**(t+1))
                
                p.data -= lr_t * state["m"] / (torch.sqrt(state["v"]) + eps)
                p.data -= lr * wd * p
                state["t"] = t + 1
                
        return loss

In [170]:
import torch
import math

class AdamW(torch.optim.Optimizer):
    def __init__(self, params, lr=1e-3, beta_1=0.9, beta_2=0.999, eps=1e-8, weight_decay=1e-2):
        defaults = dict(lr=lr, beta_1=beta_1, beta_2=beta_2, eps=eps, weight_decay=weight_decay)
        super().__init__(params, defaults)

    def step(self, closure=None):
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            lr = group["lr"]
            b1, b2 = group["beta_1"], group["beta_2"]
            eps, wd = group["eps"], group["weight_decay"]

            for p in group["params"]:
                if p.grad is None:
                    continue

                grad = p.grad.data
                state = self.state[p]

                # === Initialization ===
                if len(state) == 0:
                    state["t"] = 0
                    state["m"] = torch.zeros_like(p)
                    state["v"] = torch.zeros_like(p)

                m, v = state["m"], state["v"]
                state["t"] += 1
                t = state["t"]

                # === Moment updates ===
                m.mul_(b1).add_(grad, alpha=1 - b1)
                v.mul_(b2).addcmul_(grad, grad, value=1 - b2)

                # === Compute bias-corrected learning rate (α_t) ===
                lr_t = lr * math.sqrt(1 - b2**t) / (1 - b1**t)

                # === Parameter update ===
                p.data.addcdiv_(m, v.sqrt().add_(eps), value=-lr_t)

                # === Decoupled weight decay (final step) ===
                if wd != 0:
                    p.data.add_(p.data, alpha=-lr * wd)

        return loss


In [175]:
torch.manual_seed(10)
weights = torch.nn.Parameter(5 * torch.randn((10, 10)))
opt = AdamW([weights], lr=5e-3)
for t in range(1000):
    opt.zero_grad() # Reset the gradients for all learnable parameters.
    loss = (weights**2).mean() # Compute a scalar loss value.
    if t % 50 == 0:
        print(f"t is {t} and loss is : {loss.cpu().item()}")
    loss.backward() # Run backward pass, which computes gradients.
    
    opt.step()

t is 0 and loss is : 27.16983413696289
t is 50 and loss is : 24.990659713745117
t is 100 and loss is : 22.991731643676758
t is 150 and loss is : 21.160966873168945
t is 200 and loss is : 19.482091903686523
t is 250 and loss is : 17.940616607666016
t is 300 and loss is : 16.5238094329834
t is 350 and loss is : 15.220439910888672
t is 400 and loss is : 14.0205659866333
t is 450 and loss is : 12.915322303771973
t is 500 and loss is : 11.8967924118042
t is 550 and loss is : 10.95785903930664
t is 600 and loss is : 10.092111587524414
t is 650 and loss is : 9.29373550415039
t is 700 and loss is : 8.5574312210083
t is 750 and loss is : 7.878375053405762
t is 800 and loss is : 7.252130031585693
t is 850 and loss is : 6.674632549285889
t is 900 and loss is : 6.142139434814453
t is 950 and loss is : 5.651205062866211


In [71]:
x = torch.tensor([[10., 20., 30., 40., 50.], [10., 20., 30., 40., 50.]])
a = torch.tensor([[1., 2., 3., 4., 5.], [10., 20., 30., 40., 50.]])
torch.mean(x, dim=-1, keepdim=True).expand(x.shape)

tensor([[30., 30., 30., 30., 30.],
        [30., 30., 30., 30., 30.]])

In [72]:
x / a

tensor([[10., 10., 10., 10., 10.],
        [ 1.,  1.,  1.,  1.,  1.]])