In [1]:
import os
from pathlib import Path
import regex as re
import json
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed
from typing import Tuple, List, Iterable, BinaryIO
from collections import Counter
from bpe_tokenizer import BytePairEncodingTokenizer

In [2]:
data_path = "/scratch/shayan/Projects/LLMfromScratch/data/TinyStoriesV2-GPT4-train.txt"

with open(data_path, "r") as f:
    for i, line in enumerate(f):
        if i < 1000:
            continue
        if i >= 1050:
            break
        print(f"Line {i+1}: {line.strip()}")

Line 1001: One day, a big dog named Max saw a small cat named Lily on top of a tree. Lily was angry because she could not get down. Max wanted to help Lily, so he thought of a plan.
Line 1002: Max said, "Lily, I will join you on top of the tree and help you get down." Max climbed up the tree and slowly got closer to Lily. Lily was scared at first, but Max was kind and gentle.
Line 1003: Max said, "Hold on to me, Lily. I will take you down." Lily held on tight to Max, and they went down the tree together. Lily was happy and thanked Max for helping her. From that day on, Max and Lily became the best of friends.
Line 1004: <|endoftext|>
Line 1005: Once upon a time, there was a white shark. The white shark lived in the big sea. One day, the white shark saw a little boat. The little boat had a hole in it. The white shark wanted to help.
Line 1006: The white shark swam to the boat. The white shark said, "I can fix your boat." The man in the boat was scared. The man said, "No, go away!" The w

In [13]:
# loading the data
with open(data_path, "r") as f:
    data = f.read()

len(data)

2226845268

In [14]:
# pre-tokenize the data regex-based GPT-2 style 
from tqdm import tqdm

PAT = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
TOKEN_BYTES = b"<|endoftext|>"

# chunk_size = 1000000
# tokens = []

# for i in tqdm(range(0, len(data), chunk_size), desc="pre-tokenizing the vocabulary"):
#     chunk = data[i:i+chunk_size]
#     tokens.extend(re.findall(PAT, chunk))

In [None]:
def save_bpe(vocab, merges, output_dir, data_name):
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    if isinstance(vocab, dict):
        vocab_json = vocab
    else:
        raise TypeError("Vocabulary must be a dict of token->id")
    
    with (output_dir/f"{data_name}_vocab.json").open("w", encoding="utf-8") as f:
        json.dump(vocab_json, f, ensure_ascii=False, indent=2)

    merges_path = output_dir / f"{data_name}_merges.txt"
    with merges_path.open("w", encoding="utf-8") as f:
        for a, b in merges:
            f.write(f"{a} {b}\n")

def train_bpe_tinystories(data_path, vocab_size=10000, special_tokens=["<|endoftext|>"], out_dir="tokenizer"):
    bpe = BytePairEncodingTokenizer(data_path)
    vocabulary, merges = bpe.train_bpe(data_path, vocab_size=vocab_size, special_tokens=["<|endoftext|>"])
    save_bpe(vocabulary, merges, out_dir)

In [10]:
train_bpe_tinystories(data_path, vocab_size=10000)

Vocabulary Length: 257


tokenizing chunks: 100%|██████████| 24/24 [00:26<00:00,  1.08s/it]
Training BPE...: 100%|██████████| 9743/9743 [34:15<00:00,  4.74it/s, last merge: 10 chars]


In [3]:
with open("tokenizer/vocab.json", "r") as f:
    vocab = json.load(f)
with open("tokenizer/merges.txt", "r") as f:
    merges = f.read()

tokenizer = BytePairEncodingTokenizer.from_files(vocab_path="tokenizer/vocab.json", merges_path="tokenizer/merges.txt")

text = "This is a test for an interesting implementation of a BPE tokenizer. it was very exciting to learn all the detials"

ids = tokenizer.encode(text)
print(ids)


[1531, 431, 259, 2569, 387, 420, 2330, 1003, 2020, 377, 1553, 370, 259, 374, 81, 70, 266, 1343, 940, 282, 47, 309, 283, 378, 2929, 266, 613, 432, 263, 7278, 844, 116]


In [4]:
tokenizer = BytePairEncodingTokenizer.from_files(vocab_path="tokenizer/vocab.json", merges_path="tokenizer/merges.txt")

roundtrip = tokenizer.decode(tokenizer.encode("hello world!"))
assert roundtrip == "hello world!"

In [1]:
import torch
from einops import rearrange, einsum

In [9]:
images = torch.randn(64, 128, 128, 3)
dim_by = torch.linspace(start=0.0, end=1.0, steps=10)
print(dim_by.shape)

dim_value = rearrange(dim_by, "dim_value -> 1 dim_value 1 1 1")
print(dim_value.shape)

images_rearr = rearrange(images, "b height width channel -> b 1 height width channel")
print(images_rearr.shape)

dimmed_images = images_rearr * dim_value
dimmed_images.shape

torch.Size([10])
torch.Size([1, 10, 1, 1, 1])
torch.Size([64, 1, 128, 128, 3])


torch.Size([64, 10, 128, 128, 3])

In [10]:
dimmed_images = einsum(
    images, dim_by,
    "batch height width channel, dim_value -> batch dim_value height width channel"
)

dimmed_images.shape

torch.Size([64, 10, 128, 128, 3])

In [20]:
channels_last = torch.rand(64, 32, 32, 3)
B = torch.rand(32*32, 32*32)
print(channels_last.shape)

height = width = 32

channels_first = rearrange(channels_last, "batch height width channel -> batch channel (height width)")
print(channels_first.shape)

channels_first_transformed = einsum(
    channels_first, B,
    "batch channel pixel_in, pixel_out pixel_in -> batch channel pixel_out"
)
print(channels_first_transformed.shape)

channels_last_transformed = rearrange(channels_first_transformed, 
                                      "batch channel (height width) -> batch height width channel",
                                      height=height, width=width)
print(channels_last_transformed.shape)


x= rearrange(channels_first_transformed,
"batch channel (height width) -> batch height width channel",
height=height, width=width
)

assert x.shape == channels_last_transformed.shape

torch.Size([64, 32, 32, 3])
torch.Size([64, 3, 1024])
torch.Size([64, 3, 1024])
torch.Size([64, 32, 32, 3])


In [103]:
# implement a the linear module

import torch.nn as nn
from math import sqrt

class Linear(nn.Module):
    def __init__(self, in_features, out_features, device=None, dtype=None):     
        super().__init__()   
        
        
        self.weights = nn.Parameter(torch.empty(out_features, in_features, dtype=dtype, device=device))
        std = sqrt(2 / (in_features + out_features))
        nn.init.trunc_normal_(self.weights, mean=0.0, std=std, a=-3*std, b=3*std)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return einsum(
            x, self.weights,
            "... d_in, d_out d_in -> ... d_out"
        )

In [104]:
head = Linear(10, 128)

x = torch.rand(64, 256, 10) # (batch, seq, h)
print(x.shape)

head.forward(x).shape # (batch, seq, h_out)

torch.Size([64, 256, 10])


torch.Size([64, 256, 128])

In [None]:
from einops import einsum

class RMSNorm(nn.Module):
    def __init__(self, d_model: int, eps: float = 1e-5, device=None, dtype=None):
        super().__init__()
        self.dtype = dtype
        self.d_model = d_model
        self.eps = eps
        gain_tensor = torch.ones(d_model, device=device, dtype=dtype)
        self.gain = nn.Parameter(gain_tensor)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        in_dtype = x.dtype
        x = x.to(torch.float32)
        mean_sq = einsum(x**2, "... d_model -> ...") / self.d_model
        rms_x = torch.sqrt(mean_sq + self.eps)
        result = einsum((x / rms_x), self.gain, "... d_model, d_model -> ... d_model")
        result = result.to(in_dtype)

        return result


In [71]:
import math 
x = torch.tensor([4, 16, 25])

rms = RMSNorm(3)

rms.forward(x)

tensor([0, 0, 1])

In [42]:
x = torch.randint(10, (6,))
torch.sum(x ** 2)

tensor(272)

In [57]:
x = torch.rand(1, 4, 4)

g = torch.rand(4)
x, g


(tensor([[[0.4136, 0.7776, 0.5524, 0.2028],
          [0.2229, 0.4884, 0.2291, 0.1290],
          [0.8178, 0.8522, 0.9098, 0.5847],
          [0.6860, 0.0350, 0.5284, 0.8094]]]),
 tensor([0.7616, 0.9247, 0.8006, 0.1996]))

In [61]:
x * g

tensor([[[0.3150, 0.7190, 0.4422, 0.0405],
         [0.1698, 0.4516, 0.1834, 0.0258],
         [0.6229, 0.7880, 0.7284, 0.1167],
         [0.5225, 0.0324, 0.4230, 0.1616]]])

In [62]:
from einops import einsum

einsum(x, g, "... d_model, d_model -> ... d_model")

tensor([[[0.3150, 0.7190, 0.4422, 0.0405],
         [0.1698, 0.4516, 0.1834, 0.0258],
         [0.6229, 0.7880, 0.7284, 0.1167],
         [0.5225, 0.0324, 0.4230, 0.1616]]])

In [95]:
x = torch.randint(0, 10, (16,4,4))
einsum(x, "... d_model-> ... 1").shape

NotImplementedError: Singleton () axes are not yet supported in einsum.

In [92]:
einsum(x, "... d_model-> ...")

tensor([[13, 14, 28, 25],
        [10, 21, 19,  9],
        [17, 32,  9, 30],
        [16, 21, 18,  8],
        [19, 19,  8, 15],
        [15, 18, 20, 24],
        [27, 15, 22, 23],
        [18, 14, 25, 12],
        [13,  7, 13, 18],
        [ 7, 10, 16,  8],
        [18, 10,  5, 30],
        [ 8,  9, 16,  5],
        [10, 18, 23, 15],
        [13, 23, 25,  4],
        [22, 17, 18, 22],
        [22, 16, 19, 17]])