In [37]:
from pathlib import Path
import llm_from_scratch.c2


# The sample text is looked up next to the llm_from_scratch.c2 module file.
VERDICT_PATH = Path(llm_from_scratch.c2.__file__).with_name("the-verdict.txt")

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
RAW_VERDICT_TEXT = VERDICT_PATH.read_text(encoding="utf-8")
print("Total number of character:", len(RAW_VERDICT_TEXT))
print(RAW_VERDICT_TEXT[:99])

Total number of character: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [38]:
import re

# Sample sentence; the following cell splits the same `text` again.
text = "Hello, world. This, is a test."
# The capturing group makes re.split keep each whitespace separator in the result.
result = re.split(r"(\s)", text)
print(result)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


In [39]:
# Split on commas, periods, pipes and whitespace, keeping the separators as items.
result = re.split(r"([,.|\s])", text)
print(result)

# Drop the empty strings and whitespace-only items produced by the split.
result = [word for word in result if word.strip()]
print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']
['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test', '.']


In [40]:
import re

def tokenize(text : str) -> list[str]:
    """Split ``text`` into word and punctuation tokens.

    The text is cut at every whitespace character, at ``--`` and at each of
    the characters ``, . : ; ? _ ! " ( ) '``. Punctuation separators are kept
    as their own tokens; whitespace and empty fragments are discarded.
    """
    tokens = []
    for fragment in re.split(r'([,.:;?_!"\(\)\']|--|\s)', text):
        stripped = fragment.strip()
        if stripped:
            tokens.append(stripped)
    return tokens

In [41]:
# Quick check of tokenize() on a sentence that contains a double dash.
text = "Hello, world. Is this-- a test?"
print(tokenize(text))

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


In [42]:
# Tokenize the whole text with the regex-based splitter.
VERDICT_TOKENS = tokenize(RAW_VERDICT_TEXT)
print("The verdict has this many tokens:", len(VERDICT_TOKENS))

The verdict has this many tokens: 4690


In [43]:
# Peek at the first 30 tokens.
print(VERDICT_TOKENS[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [44]:
# Deduplicate the tokens and sort them so each one has a fixed position.
ALL_WORDS = sorted(set(VERDICT_TOKENS))
VOCAB_SIZE = len(ALL_WORDS)
print("Vocabulary size is:", VOCAB_SIZE)

Vocabulary size is: 1130


In [45]:
# Map every unique token to its index in the sorted list.
VOCAB = {word : idx for idx, word in enumerate(ALL_WORDS)}

In [46]:
# Show the first 30 vocabulary entries. Slicing a list of the items avoids
# binding the throwaway name `_`, which IPython also uses for the last output.
for item in list(VOCAB.items())[:30]:
    print(item)

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)
('Burlington', 21)
('But', 22)
('By', 23)
('Carlo', 24)
('Chicago', 25)
('Claude', 26)
('Come', 27)
('Croft', 28)
('Destroyed', 29)


In [47]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed vocabulary.

    Text is split on whitespace, ``--`` and common punctuation; every resulting
    token is looked up in ``vocab``. Tokens missing from the vocabulary make
    ``encode`` raise ``KeyError``.
    """

    # Capturing group so that re.split keeps the punctuation separators as tokens.
    _SPLIT_RE = re.compile(r'([,.:;?_!\"\(\)\']|--|\s)')
    # Whitespace directly in front of a punctuation token. ':' and ';' are
    # included because encode() emits them as standalone tokens, so decode()
    # has to re-attach them to the preceding word as well.
    _SPACE_BEFORE_PUNCT_RE = re.compile(r'\s+([,.:;?!\"\(\)\'])')

    def __init__(self, vocab : dict[str, int]) -> None:
        """Store ``vocab`` (token -> id) and build the reverse id -> token map."""
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text : str) -> list[int]:
        """Convert ``text`` into a list of token ids.

        Raises:
            KeyError: if a token is not present in the vocabulary.
        """
        pieces = (piece.strip() for piece in self._SPLIT_RE.split(text))
        return [self.str_to_int[piece] for piece in pieces if piece]

    def decode(self, ids : list[int]) -> str:
        """Convert token ids back into text.

        Tokens are joined with single spaces, then the whitespace in front of
        the characters ``, . : ; ? ! " ( ) '`` is removed.

        Raises:
            KeyError: if an id is not present in the vocabulary.
        """
        text = " ".join(self.int_to_str[i] for i in ids)
        return self._SPACE_BEFORE_PUNCT_RE.sub(r'\1', text)


In [48]:
# Build a tokenizer over the vocabulary derived from the text.
TOKENIZER = SimpleTokenizerV1(VOCAB)

In [49]:
# Round-trip a two-line sentence through encode() and decode().
text = """"It's the last he painted, you know," 
       Mrs. Gisburn said with pardonable pride."""
ids = TOKENIZER.encode(text)

print("The ids are:")
print(ids)

# decode() joins tokens with spaces, so the original spacing is not fully restored.
print("The decoded text is:")
print(TOKENIZER.decode(ids))


The ids are:
[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]
The decoded text is:
" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


In [50]:
# SimpleTokenizerV1 cannot encode words that are missing from the vocabulary.
# The expected KeyError is caught and displayed so the notebook still runs
# from top to bottom.
try:
    TOKENIZER.encode("Hello, do you like tea?")
except KeyError as err:
    print("KeyError:", err)

KeyError: 'Hello'

In [51]:
# Rebuild the sorted token list and append the two special tokens at the end.
ALL_WORDS = sorted(set(VERDICT_TOKENS))
print("Type of ALL_WORDS is", type(ALL_WORDS))
ALL_WORDS += ["<|endoftext|>", "<|unk|>"]
print("Length of all words is", len(ALL_WORDS))
VOCAB = {word : idx for idx, word in enumerate(ALL_WORDS)}

print("Vocab size is:", len(VOCAB))

# Show the last five entries; the special tokens were appended last, so they
# receive the two highest ids.
for item in list(VOCAB.items())[-5:]:
    print(item)

Type of ALL_WORDS is <class 'list'>
Length of all words is 1132
Vocab size is: 1132
('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


In [52]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    Text is split on whitespace, ``--`` and common punctuation; every resulting
    token is looked up in ``vocab`` and replaced by the id of ``"<|unk|>"``
    when it is missing.
    """

    # Capturing group so that re.split keeps the punctuation separators as tokens.
    _SPLIT_RE = re.compile(r'([,.:;?_!\"\(\)\']|--|\s)')
    # Whitespace directly in front of a punctuation token. ':' and ';' are
    # included because encode() emits them as standalone tokens, so decode()
    # has to re-attach them to the preceding word as well.
    _SPACE_BEFORE_PUNCT_RE = re.compile(r'\s+([,.:;?!\"\(\)\'])')

    def __init__(self, vocab : dict[str, int]) -> None:
        """Store ``vocab`` (token -> id) and build the reverse id -> token map.

        Raises:
            KeyError: if ``vocab`` has no ``"<|unk|>"`` entry.
        """
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}
        self.unk_idx = self.str_to_int["<|unk|>"]

    def encode(self, text : str) -> list[int]:
        """Convert ``text`` into token ids, using the ``<|unk|>`` id for unknown tokens."""
        pieces = (piece.strip() for piece in self._SPLIT_RE.split(text))
        return [self.str_to_int.get(piece, self.unk_idx) for piece in pieces if piece]

    def decode(self, ids : list[int]) -> str:
        """Convert token ids back into text.

        Tokens are joined with single spaces, then the whitespace in front of
        the characters ``, . : ; ? ! " ( ) '`` is removed.

        Raises:
            KeyError: if an id is not present in the vocabulary.
        """
        text = " ".join(self.int_to_str[i] for i in ids)
        return self._SPACE_BEFORE_PUNCT_RE.sub(r'\1', text)


# Rebind TOKENIZER to the version that handles unknown tokens.
TOKENIZER = SimpleTokenizerV2(VOCAB)

In [53]:
# Two unrelated sentences joined by the <|endoftext|> marker.
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = " <|endoftext|> ".join((text1, text2))
print(text)

# Tokens missing from the vocabulary are encoded as the <|unk|> id.
encoded = TOKENIZER.encode(text)
decoded = TOKENIZER.decode(encoded)

print("Encoded text:")
print(encoded)

print("Decoded text:")
print(decoded)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.
Encoded text:
[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]
Decoded text:
<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


In [54]:
import tiktoken
from importlib.metadata import version

# Report the installed tiktoken version two ways: from the package metadata
# and from the module's __version__ attribute.
print("tiktoken version method 1:", version("tiktoken"), tiktoken.__version__)



tiktoken version method 1: 0.8.0 0.8.0


In [55]:
# Load tiktoken's "gpt2" encoding.
GPT2_TOKENIZER = tiktoken.get_encoding("gpt2")


In [56]:
# NOTE(review): the two adjacent string literals are concatenated without a
# space, so the text contains "terracesof" — confirm whether that is intended.
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
     "of someunknownPlace."
)

# allowed_special permits the literal <|endoftext|> marker in the input text.
integers = GPT2_TOKENIZER.encode(text, allowed_special={"<|endoftext|>"})
print("Encoded text is: ")
print(integers)

decoded = GPT2_TOKENIZER.decode(integers)
print("Decoded text is: ")
print(decoded)

Encoded text is: 
[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]
Decoded text is: 
Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


In [57]:
# Encode a made-up string and decode it again to inspect the round trip.
enc = GPT2_TOKENIZER.encode("Akwirw ier")
print(enc)
print(GPT2_TOKENIZER.decode(enc))

[33901, 86, 343, 86, 220, 959]
Akwirw ier


In [58]:
# Encode the whole text with the GPT-2 tokenizer.
enc_text = GPT2_TOKENIZER.encode(RAW_VERDICT_TEXT)
print("Tokens in the set:", len(enc_text))

# Skip the first 50 tokens and work with the remainder.
enc_sample = enc_text[50:]

print("First sample")
context_size = 4         # number of tokens in the input window
# x is the input window; y is the same window shifted one token to the right.
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]
print(f"x: {x}")
print(f"y:      {y}")

Tokens in the set: 5145
First sample
x: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]


In [59]:
# For each prefix length i, the first i tokens are the context and token i is
# the one that follows it.
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(context, "---->", desired)

[290] ----> 4920
[290, 4920] ----> 2241
[290, 4920, 2241] ----> 287
[290, 4920, 2241, 287] ----> 257


In [62]:
# Same context/target pairs as token ids, but decoded back into text.
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    # decode() takes a list of ids, hence the single-element list for the target.
    print(GPT2_TOKENIZER.decode(context), "---->", GPT2_TOKENIZER.decode([desired]))  


 and ---->  established
 and established ---->  himself
 and established himself ---->  in
 and established himself in ---->  a


In [63]:
import torch
from typing import Protocol
from torch.utils.data import Dataset


class Tokenizer(Protocol):
    """Structural type for objects that convert between text and token ids."""

    def encode(self, value: str) -> list[int]:
        """Return the token ids for ``value``."""
        ...
    def decode(self, value: list[int]) -> str:
        """Return the text for the token ids in ``value``."""
        ...


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is encoded once with ``tokenizer``. Windows of ``max_length``
    tokens are taken every ``stride`` tokens, and each target is its input
    window shifted one token to the right.
    """

    def __init__(self, txt : str, tokenizer : Tokenizer, max_length : int, stride : int) -> None:
        encoded = tokenizer.encode(txt)
        # Stop early enough that every window still has a full shifted target.
        starts = range(0, len(encoded) - max_length, stride)

        self.input_ids: list[torch.Tensor] = [
            torch.tensor(encoded[start:start + max_length]) for start in starts
        ]
        self.target_ids: list[torch.Tensor] = [
            torch.tensor(encoded[start + 1:start + max_length + 1]) for start in starts
        ]

    def __len__(self) -> int:
        """Return the number of windows in the dataset."""
        return len(self.input_ids)

    def __getitem__(self, idx : int) -> tuple[torch.Tensor, torch.Tensor]:
        """Return the ``(input, target)`` tensor pair stored at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [69]:
# Build windows of 5 tokens, advancing one token at a time.
dataset = GPTDatasetV1(RAW_VERDICT_TEXT, tokenizer=GPT2_TOKENIZER, max_length=5, stride=1)

# Number of windows, then the (input, target) pair at index 4.
print(len(dataset))
print(dataset[4])


5140
(tensor([ 1807,  3619,   402,   271, 10899]), tensor([ 3619,   402,   271, 10899,  2138]))


In [71]:
from torch.utils.data import DataLoader


def create_dataloader_v1(
    txt : str,
    batch_size: int=4,
    max_length: int=256,
    stride: int=128,
    shuffle: bool =True,
    drop_last: bool =True,
    num_workers: int=0,
) -> DataLoader:
    """Build a ``DataLoader`` over sliding token windows of ``txt``.

    The text is encoded with tiktoken's "gpt2" encoding and wrapped in a
    ``GPTDatasetV1``.

    Args:
        txt: Raw text to tokenize.
        batch_size: Forwarded to ``DataLoader``.
        max_length: Number of tokens per window, forwarded to ``GPTDatasetV1``.
        stride: Step between window starts, forwarded to ``GPTDatasetV1``.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        The ``DataLoader`` built over the dataset (not the dataset itself).
    """
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )


In [80]:
from itertools import islice

dataloader = create_dataloader_v1(RAW_VERDICT_TEXT, batch_size=2, max_length=5, stride=2, shuffle=False)

# islice stops after four batches without pulling a fifth one from the loader
# (zip(dataloader, range(4)) would fetch it and throw it away).
for x, y in islice(dataloader, 4):
    print(x)
    print(y)
    # 2 is the batch size
    # 5 is the window size (max_length).
    assert y.shape == x.shape == torch.Size([2, 5])
    print("--")

tensor([[  40,  367, 2885, 1464, 1807],
        [2885, 1464, 1807, 3619,  402]])
tensor([[ 367, 2885, 1464, 1807, 3619],
        [1464, 1807, 3619,  402,  271]])
--
tensor([[ 1807,  3619,   402,   271, 10899],
        [  402,   271, 10899,  2138,   257]])
tensor([[ 3619,   402,   271, 10899,  2138],
        [  271, 10899,  2138,   257,  7026]])
--
tensor([[10899,  2138,   257,  7026, 15632],
        [  257,  7026, 15632,   438,  2016]])
tensor([[ 2138,   257,  7026, 15632,   438],
        [ 7026, 15632,   438,  2016,   257]])
--
tensor([[15632,   438,  2016,   257,   922],
        [ 2016,   257,   922,  5891,  1576]])
tensor([[ 438, 2016,  257,  922, 5891],
        [ 257,  922, 5891, 1576,  438]])
--


In [83]:
import torch

# Tiny embedding table: 6 token ids, 3 values per token.
vocab_size=6
output_dim=3

# Fix the seed so the randomly initialised weights are reproducible.
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

# NOTE: the printed weights might not match the book's if the book was run on a GPU.
print(embedding_layer.weight)


Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [85]:
# Look up one token id; the result is the matching row of the weight matrix.
x = torch.tensor([3])
y = embedding_layer(x)
print(x.shape, y.shape)
print(embedding_layer(x))


torch.Size([1]) torch.Size([1, 3])
tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


In [86]:
# Look up several token ids at once; one output row per id, in input order.
x = torch.tensor([2, 3, 5, 1])
y = embedding_layer(x)
print(x.shape, y.shape)
print(embedding_layer(x))


torch.Size([4]) torch.Size([4, 3])
tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


In [None]:
# Embedding layer
# NOTE(review): 50257 is assumed to be the vocabulary size of the "gpt2" encoding — confirm.
vocab_size = 50257
output_dim = 256
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

# Data loader
# stride == max_length, so consecutive windows do not overlap.
max_length = 4
dataloader = create_dataloader_v1(
    RAW_VERDICT_TEXT, batch_size=8, max_length=max_length,
   stride=max_length, shuffle=False
)

# print some examples
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Token IDs:\n", inputs)
print("Inputs shape:\n", inputs.shape)

# Each token id is replaced by its output_dim-sized embedding vector.
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
Inputs shape:
 torch.Size([8, 4])
torch.Size([8, 4, 256])


In [94]:
# Positional embeddings: one learned vector per position in the context window.
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

# Look up the vectors for positions 0 .. context_length - 1.
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print(pos_embeddings.shape)

# print(pos_embedding_layer.weight.shape) would print the same shape


torch.Size([4, 256])


In [101]:
# Broadcasting demo: the one-element tensor [10] is added to every element of
# the left operand, however many dimensions that operand has.

print(torch.tensor([1]) + torch.tensor([10]))

print(torch.tensor([1, 2]) + torch.tensor([10]))

print(torch.tensor([[1], [3]]) + torch.tensor([10]))

print(torch.tensor([[[1], [2]], [[3], [4]]]) + torch.tensor([10]))

tensor([11])
tensor([11, 12])
tensor([[11],
        [13]])
tensor([[[11],
         [12]],

        [[13],
         [14]]])


In [None]:
# Broadcasting adds the per-position vectors to every sequence in the batch.
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)