###### Imports

In [1]:
from importlib.metadata import version


def ver(libstr):
    """Print the installed version of the distribution named ``libstr``."""
    print(f'{libstr} version: {version(libstr)}')


# Report the versions of the packages this notebook relies on.
for libstr in ("torch", "tiktoken", "mlPLayGround"):
    ver(libstr)

torch version: 2.5.1+cu118
tiktoken version: 0.8.0
mlPLayGround version: 0.1.0


In [2]:
import tiktoken
import torch
from mlPlayGround.recurrent.attention import multiHeadAttention, multiHeadAttentionTorch, multiHeadAttentionTorchSDP

In [4]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

print(f"PyTorch version: {torch.__version__}")
print(f"Running on {device}")

PyTorch version: 2.5.1+cu118
Running on cuda


In [9]:
# Shared configuration for the attention benchmarks below.
batch_size, context_len, embed_dim = 8, 1024, 768
n_heads = 12
dropout = 0.0
bias = False

# Random input batch of shape (batch_size, context_len, embed_dim), created
# directly on the selected device.
embeddings = torch.randn(batch_size, context_len, embed_dim, device=device)

In [10]:
mha = multiHeadAttention(embed_dim, embed_dim, n_heads,
                         context_len, dropout, bias).to(device)
# print(mha(embeddings).shape)
%timeit mha(embeddings)

58.1 ms ± 120 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [11]:
mha = multiHeadAttentionTorch(embed_dim, embed_dim, n_heads,
                              context_len, dropout, bias, False).to(device)
# print(mha(embeddings).shape)
%timeit mha(embeddings)

31 ms ± 30.1 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [12]:
mha = multiHeadAttentionTorch(embed_dim, embed_dim, n_heads,
                              context_len, dropout, bias, True).to(device)
# print(mha(embeddings).shape)
%timeit mha(embeddings)

61.5 ms ± 112 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [13]:
mha = multiHeadAttentionTorchSDP(embed_dim, embed_dim, n_heads,
                                 context_len, dropout, bias).to(device)
# print(mha(embeddings).shape)
%timeit mha(embeddings)

21.4 ms ± 26.6 μs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [17]:
# Build (i, 5*i) pairs, then "unzip" them into two parallel tuples with zip(*...).
pairs = [(i, 5 * i) for i in range(20)]
a, b = zip(*pairs)
print(a)
print(b)

(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19)
(0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95)


##### Building an LLM

###### Tokenization

In [11]:
# Load the sample text, downloading it on first use.
import os
import urllib.request

fname = "../data/llmbuild/the-verdict.txt"
if not os.path.exists(fname):
    # urlretrieve does not create missing parent directories, so make sure the
    # target folder exists; otherwise a fresh checkout fails with
    # FileNotFoundError before anything is downloaded.
    os.makedirs(os.path.dirname(fname), exist_ok=True)
    url = ("https://raw.githubusercontent.com/rasbt/"
           "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
           "the-verdict.txt")
    urllib.request.urlretrieve(url, fname)

# Read the whole file into memory as a single string.
with open(fname, "r", encoding="utf-8") as f:
    raw_text = f.read()
print("Total number of character:", len(raw_text))
print(raw_text[:99])

Total number of character: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


We would like to tokenize and embed the above text. To begin, we create a simple tokenizer using some sample text.

In [50]:
# Byte-pair encoding with tiktoken's "gpt2" encoding.
tokenizer = tiktoken.get_encoding("gpt2")
tokens = tokenizer.encode(raw_text)

# Sanity check: decoding the token ids should reproduce the original text.
round_trip_ok = tokenizer.decode(tokens) == raw_text
print(round_trip_ok)

True


In [53]:
# Seed the global RNG so the dropout mask printed below is reproducible.
torch.manual_seed(123)

example = torch.ones(6, 6)  # 6x6 matrix of ones
# NOTE(review): this rebinds `dropout`, which the benchmark configuration cell
# defines as a float rate; re-running the benchmark cells after this one would
# pass this module instead of the float.
dropout = torch.nn.Dropout(p=0.5)  # drop each element with probability 0.5

print(dropout(example))

tensor([[2., 2., 2., 2., 2., 2.],
        [0., 2., 0., 0., 0., 0.],
        [0., 0., 2., 0., 2., 0.],
        [2., 2., 0., 0., 0., 2.],
        [2., 0., 0., 0., 0., 2.],
        [0., 2., 0., 0., 0., 0.]])


In [54]:
# The Dropout module above was built without inplace=True, so applying it
# returned a new tensor; the input printed here is still all ones.
print(example)

tensor([[1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1.]])


In [62]:
# Compare the total before and after dropout. Kept elements are rescaled by
# 1 / (1 - p), so the sum is preserved in expectation (not on every draw).
print(example.sum())
print(dropout(example).sum())

tensor(36.)
tensor(36.)


In [28]:
import re

text = "Hello, world. This, is a test."
# Split on punctuation, double dashes and whitespace; the capturing group makes
# re.split keep the delimiters as separate items.
pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)
# Drop empty and whitespace-only fragments.
result = [piece for piece in pieces if piece.strip()]
print(result)

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


In [31]:
# Apply the same split to the full text, discarding empty and
# whitespace-only fragments.
fragments = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preproc = [fragment for fragment in fragments if fragment.strip()]
print(len(preproc))
print(preproc[:30])

4690
['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [32]:
# get unique tokens
all_tokens = sorted(set(preproc))
vocab_size = len(all_tokens)
print(vocab_size)

1130


In [33]:
# Map each token to its position in the sorted token list.
vocab = dict(zip(all_tokens, range(len(all_tokens))))

In [34]:
def get_pairs(word):
    """Return the set of adjacent symbol pairs in a word.

    Args:
        word: Sliceable sequence of symbols, e.g. a tuple of variable-length
            strings or a plain string.

    Returns:
        A set of ``(left, right)`` tuples, one for each pair of neighbouring
        symbols. The set is empty when ``word`` has fewer than two symbols.
    """
    # zip stops at the shorter input, so empty and single-symbol words yield
    # no pairs instead of raising IndexError on word[0].
    return set(zip(word, word[1:]))

TODO: how do audio and video embeddings work?