In [1]:
import os
import urllib.request

In [2]:
if not os.path.exists("the-verdict.txt"):
    url = ("https://raw.githubusercontent.com/rasbt/"
           "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
           "the-verdict.txt")
    file_path = "the-verdict.txt"
    urllib.request.urlretrieve(url, file_path)

In [3]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [4]:
print(f"Total number of character: {len(raw_text):_}")
print(raw_text[:300])

Total number of character: 20_479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would ha


# Tokenization

In [5]:
import tiktoken

In [6]:
import sys

# pympler will give you a more accurate picture since it recursively measures all
# the memory used by the object and its contents.
from pympler import asizeof

def cmp_sizes(obj):
    sys_size_mb =  sys.getsizeof(obj) / 1024 / 1024
    pympler_size_mb = asizeof.asizeof(obj) / 1024 / 1024
    print(f"Total size according to sys: {sys_size_mb:.5f} MB")
    print(f"Total size according to pympler: {pympler_size_mb:.2f} MB")

In [15]:
tokenizer = tiktoken.get_encoding("gpt2")

cmp_sizes(tokenizer)

Total size according to sys: 0.00005 MB
Total size according to pympler: 6.08 MB


In [16]:
tokenizer = tiktoken.get_encoding('cl100k_base')
cmp_sizes(tokenizer)

Total size according to sys: 0.00005 MB
Total size according to pympler: 12.13 MB


In [17]:
cmp_sizes(raw_text)

Total size according to sys: 0.01957 MB
Total size according to pympler: 0.02 MB


In [18]:
enc_text = tokenizer.encode(raw_text)
cmp_sizes(enc_text)

Total size according to sys: 0.03777 MB
Total size according to pympler: 0.17 MB


# Input-Target Pairs

In [19]:
ctx_size = 4

In [20]:
x = enc_text[: ctx_size]
y = enc_text[1: ctx_size+1]

print(f'{x=}')
print(f'    {y=}')

x=[40, 473, 1846, 2744]
    y=[473, 1846, 2744, 3463]


In [21]:
for i in range(1, ctx_size+1):
    context = enc_text[:i]
    desired = enc_text[i]
    print(f'{context} --> {desired}')
    print(f'{tokenizer.decode(context)} --> {tokenizer.decode([desired])}')

[40] --> 473
I -->  H
[40, 473] --> 1846
I H --> AD
[40, 473, 1846] --> 2744
I HAD -->  always
[40, 473, 1846, 2744] --> 3463
I HAD always -->  thought


# Pytorch

In [7]:
import torch
print("PyTorch version:", torch.__version__)

PyTorch version: 2.5.1


In [8]:
torch.backends.mps.is_available()

True

In [9]:
torch.backends.cpu.get_cpu_capability()

'NO AVX'

In [10]:
if torch.cuda.is_available():
    _device = 'cuda'
elif torch.backends.mps.is_available():
    _device = 'mps'
else:
    _device = 'cpu'

_device

'mps'

In [19]:
device = torch.device(_device)
device

device(type='mps')

In [20]:
def to_numpy(tensor):
    # cpu() copies it to the CPU.
    # detach() detaches it from the current graph - result will never require gradient.
    return tensor.detach().cpu().numpy() if tensor.requires_grad else tensor.cpu().numpy()

## Pytorch Data Loaders

In [21]:
from torch.utils.data import Dataset, DataLoader

In [22]:
# Example to make sense of the sliding window for-loop below.
[i for i in range(0, 10, 2)]

[0, 2, 4, 6, 8]

In [23]:
class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []
        
        # Tokenize the entire text.
        token_ids = tokenizer.encode(txt)

        # Use a sliding window to chunk the book into overlapping sequences of max_length.
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i : i+max_length]
            target_chunk = token_ids[i+1 : i+max_length+1]
            self.input_ids.append(torch.tensor(input_chunk, dtype=torch.float32))
            self.target_ids.append(torch.tensor(target_chunk, dtype=torch.float32))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

def create_dataloader_v1(
    txt, /, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0,
    encoding='cl100k_base',
):
    # Initialize the tokenizer.
    tokenizer = tiktoken.get_encoding(encoding)

    # Create dataset.
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader.
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader

In [24]:
# Example to make sense of the sliding window for-loop below.
_len_token_ids = 10
_max_length = 4
_stride = 1
[(i, i+_max_length) for i in range(0, _len_token_ids - _max_length, _stride)]

[(0, 4), (1, 5), (2, 6), (3, 7), (4, 8), (5, 9)]

In [25]:
# Create batches of size max_lenght.
dataloader = create_dataloader_v1(raw_text, batch_size=1, max_length=4, stride=1, shuffle=False)

data_iter = iter(dataloader)
first_batch = next(data_iter)
first_batch

[tensor([[  40.,  473., 1846., 2744.]]),
 tensor([[ 473., 1846., 2744., 3463.]])]

In [26]:
for batch_idx, (features, labels) in enumerate(dataloader):
    features, labels = features.to(device), labels.to(device)
    print(f'{batch_idx=}\n\t{features=}\n\t{labels=}')

    if batch_idx > 1:
        break

batch_idx=0
	features=tensor([[  40.,  473., 1846., 2744.]], device='mps:0')
	labels=tensor([[ 473., 1846., 2744., 3463.]], device='mps:0')
batch_idx=1
	features=tensor([[ 473., 1846., 2744., 3463.]], device='mps:0')
	labels=tensor([[1846., 2744., 3463., 7762.]], device='mps:0')
batch_idx=2
	features=tensor([[1846., 2744., 3463., 7762.]], device='mps:0')
	labels=tensor([[2744., 3463., 7762.,  480.]], device='mps:0')


In [28]:
# Changed the batch size.
dataloader = create_dataloader_v1(raw_text, batch_size=2, max_length=4, stride=1, shuffle=False)

for batch_idx, (features, labels) in enumerate(dataloader):
    features, labels = features.to(device), labels.to(device)
    print(f'{batch_idx=}\n  {features=}\n  {labels=}')

    if batch_idx > 1:
        break

batch_idx=0
  features=tensor([[  40.,  473., 1846., 2744.],
        [ 473., 1846., 2744., 3463.]], device='mps:0')
  labels=tensor([[ 473., 1846., 2744., 3463.],
        [1846., 2744., 3463., 7762.]], device='mps:0')
batch_idx=1
  features=tensor([[1846., 2744., 3463., 7762.],
        [2744., 3463., 7762.,  480.]], device='mps:0')
  labels=tensor([[2744., 3463., 7762.,  480.],
        [3463., 7762.,  480.,  285.]], device='mps:0')
batch_idx=2
  features=tensor([[ 3463.,  7762.,   480.,   285.],
        [ 7762.,   480.,   285., 22464.]], device='mps:0')
  labels=tensor([[ 7762.,   480.,   285., 22464.],
        [  480.,   285., 22464.,  4856.]], device='mps:0')
