In [1]:
import os
import urllib.request

In [2]:
# Download the sample text only when it is not already present locally.
# `file_path` is bound unconditionally so it is defined on every run, and the
# same name is used for both the existence check and the download target.
file_path = "the-verdict.txt"
if not os.path.exists(file_path):
    url = ("https://raw.githubusercontent.com/rasbt/"
           "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
           "the-verdict.txt")
    urllib.request.urlretrieve(url, file_path)

In [3]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [4]:
print(f"Total number of character: {len(raw_text):_}")
print(raw_text[:300])

Total number of character: 20_479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would ha


# Tokenization

In [5]:
import tiktoken

In [6]:
import sys

# pympler will give you a more accurate picture since it recursively measures all
# the memory used by the object and its contents.
from pympler import asizeof

def cmp_sizes(obj):
    """Print the size of ``obj`` in MB as measured by ``sys`` and by pympler.

    ``sys.getsizeof`` is reported with five decimals and
    ``asizeof.asizeof`` (which recursively includes the object's contents)
    with two decimals.

    Args:
        obj: Any Python object to measure.
    """
    bytes_per_mb = 1024 * 1024
    shallow_mb = sys.getsizeof(obj) / bytes_per_mb
    deep_mb = asizeof.asizeof(obj) / bytes_per_mb
    print(f"Total size according to sys: {shallow_mb:.5f} MB")
    print(f"Total size according to pympler: {deep_mb:.2f} MB")

In [7]:
tokenizer = tiktoken.get_encoding("gpt2")

cmp_sizes(tokenizer)

Total size according to sys: 0.00005 MB
Total size according to pympler: 6.08 MB


In [8]:
tokenizer = tiktoken.get_encoding('cl100k_base')
cmp_sizes(tokenizer)

Total size according to sys: 0.00005 MB
Total size according to pympler: 12.13 MB


In [9]:
cmp_sizes(tiktoken.get_encoding('o200k_base'))

Total size according to sys: 0.00005 MB
Total size according to pympler: 24.31 MB


In [10]:
cmp_sizes(raw_text)

Total size according to sys: 0.01957 MB
Total size according to pympler: 0.02 MB


In [11]:
enc_text = tokenizer.encode(raw_text)
cmp_sizes(enc_text)

Total size according to sys: 0.03777 MB
Total size according to pympler: 0.17 MB


# Input-Target Pairs

In [12]:
ctx_size = 4

In [13]:
x = enc_text[: ctx_size]
y = enc_text[1: ctx_size+1]

print(f'{x=}')
print(f'    {y=}')

x=[40, 473, 1846, 2744]
    y=[473, 1846, 2744, 3463]


In [14]:
for i in range(1, ctx_size+1):
    context = enc_text[:i]
    desired = enc_text[i]
    print(f'{context} --> {desired}')
    print(f'\t{tokenizer.decode(context)} --> {tokenizer.decode([desired])}')

[40] --> 473
	I -->  H
[40, 473] --> 1846
	I H --> AD
[40, 473, 1846] --> 2744
	I HAD -->  always
[40, 473, 1846, 2744] --> 3463
	I HAD always -->  thought


# PyTorch

In [15]:
import torch
print("PyTorch version:", torch.__version__)

PyTorch version: 2.6.0


In [16]:
torch.backends.mps.is_available()

True

In [17]:
torch.backends.cpu.get_cpu_capability()

'NO AVX'

In [18]:
# Pick the best available backend: CUDA first, then MPS, otherwise the CPU.
_device = (
    'cuda' if torch.cuda.is_available()
    else 'mps' if torch.backends.mps.is_available()
    else 'cpu'
)

device = torch.device(_device)
device

device(type='mps')

In [19]:
def to_numpy(tensor):
    """Return the contents of ``tensor`` as a NumPy array on the CPU.

    Args:
        tensor: A ``torch.Tensor``, with or without gradient tracking.

    Returns:
        The result of calling ``.cpu().numpy()`` on the (detached) tensor.
    """
    if tensor.requires_grad:
        # Detach from the current graph first; the result never requires gradient.
        tensor = tensor.detach()
    # cpu() moves the data to the CPU before the NumPy conversion.
    return tensor.cpu().numpy()

## PyTorch Data Loaders

In [20]:
from torch.utils.data import Dataset, DataLoader

In [21]:
# Types https://pytorch.org/docs/stable/tensors.html
torch.iinfo(torch.int)

iinfo(min=-2.14748e+09, max=2.14748e+09, dtype=int32)

In [22]:
def print_type_range(dtype):
    """Print the smallest and largest value representable by a torch dtype.

    Integer dtypes are looked up with ``torch.iinfo`` and printed with ``_``
    as thousands separator. Floating-point dtypes, which ``torch.iinfo``
    rejects, are looked up with ``torch.finfo`` and printed in scientific
    notation.

    Args:
        dtype: The ``torch.dtype`` to describe.
    """
    if dtype.is_floating_point:
        info = torch.finfo(dtype)
        print(f"{dtype}: {info.min:e} to {info.max:e}")
    else:
        info = torch.iinfo(dtype)
        print(f"{dtype}: {info.min:_d} to {info.max:_d}")

In [23]:
print_type_range(torch.short)
print_type_range(torch.int)
print_type_range(torch.long)

torch.int16: -32_768 to 32_767
torch.int32: -2_147_483_648 to 2_147_483_647
torch.int64: -9_223_372_036_854_775_808 to 9_223_372_036_854_775_807


In [24]:
# Example to make sense of the sliding window for-loop below.
# Takeaway: range is inclusive on the left, exclusive on the right.
[i for i in range(0, 10, 2)]

[0, 2, 4, 6, 8]

In [25]:
# Example to make sense of the sliding window for-loop below.
# Takeaway: if we know the total number of token IDs, we can create
# len(token IDs) - max_length sequences of size max_length at most (if stride is 1).
_len_token_ids = 10
_max_length = 4      # This is the length of the sequences we want to create
_stride = 1


[(i, i+_max_length) for i in range(0, _len_token_ids - _max_length, _stride)]

[(0, 4), (1, 5), (2, 6), (3, 7), (4, 8), (5, 9)]

In [26]:
len(tokenizer.encode(raw_text))

4943

In [27]:
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID sequences.

    The text is tokenized once and cut into windows of ``max_length`` token
    IDs, each window starting ``stride`` tokens after the previous one. The
    target of a window is the same window shifted one token to the right.
    Only full windows that also have a full target are kept, so a text with
    ``max_length`` tokens or fewer produces an empty dataset.

    Args:
        txt: The raw text to tokenize.
        tokenizer: Any object with an ``encode(txt)`` method returning a
            sequence of integer token IDs.
        max_length: Number of token IDs per sequence; must be positive.
        stride: Step between the starts of consecutive windows; must be
            positive.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is not positive.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # A negative stride would silently yield an empty dataset and a
        # non-positive max_length would yield empty or misaligned windows.
        if max_length <= 0:
            raise ValueError(f"max_length must be positive, got {max_length}")
        if stride <= 0:
            raise ValueError(f"stride must be positive, got {stride}")

        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text.
        token_ids = tokenizer.encode(txt)

        # Use a sliding window to chunk the text into overlapping sequences of max_length.
        # The upper bound leaves room for the target, which ends one token later.
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i : i+max_length]
            target_chunk = token_ids[i+1 : i+max_length+1]
            # Token IDs are stored as 64-bit integers.
            self.input_ids.append(torch.tensor(input_chunk, dtype=torch.long))
            self.target_ids.append(torch.tensor(target_chunk, dtype=torch.long))

    def __len__(self):
        """
        Return the total number of rows in the dataset
        """
        return len(self.input_ids)

    def __getitem__(self, idx):
        """
        Return row idx from the dataset as an (input_ids, target_ids) pair
        """
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader_v1(
    txt, /, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0,
    encoding='cl100k_base',
):
    """Build a ``DataLoader`` over a ``GPTDatasetV1`` created from ``txt``.

    Args:
        txt: The raw text (positional-only).
        batch_size: Forwarded to ``DataLoader``.
        max_length: Sequence length passed to ``GPTDatasetV1``.
        stride: Window step passed to ``GPTDatasetV1``.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.
        encoding: Name of the tiktoken encoding used to tokenize ``txt``.

    Returns:
        The configured ``DataLoader``.
    """
    # The tokenizer is looked up by name and handed to the dataset together with the text.
    return DataLoader(
        GPTDatasetV1(txt, tiktoken.get_encoding(encoding), max_length, stride),
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [28]:
# Create batches of sequences of length max_length (batch_size=1, so one sequence per batch).
dataloader = create_dataloader_v1(raw_text, batch_size=1, max_length=4, stride=1, shuffle=False)

data_iter = iter(dataloader)
first_batch = next(data_iter)
first_batch

[tensor([[  40,  473, 1846, 2744]]), tensor([[ 473, 1846, 2744, 3463]])]

In [29]:
for batch_idx, (features, labels) in enumerate(dataloader):
    features, labels = features.to(device), labels.to(device)
    print(f'{batch_idx=}\n\t{features=}\n\t{labels=}')

    if batch_idx > 1:
        break

batch_idx=0
	features=tensor([[  40,  473, 1846, 2744]], device='mps:0')
	labels=tensor([[ 473, 1846, 2744, 3463]], device='mps:0')
batch_idx=1
	features=tensor([[ 473, 1846, 2744, 3463]], device='mps:0')
	labels=tensor([[1846, 2744, 3463, 7762]], device='mps:0')
batch_idx=2
	features=tensor([[1846, 2744, 3463, 7762]], device='mps:0')
	labels=tensor([[2744, 3463, 7762,  480]], device='mps:0')


In [30]:
# Changed the batch size.
dataloader = create_dataloader_v1(raw_text, batch_size=2, max_length=4, stride=1, shuffle=False)

for batch_idx, (features, labels) in enumerate(dataloader):
    features, labels = features.to(device), labels.to(device)
    print(f'{batch_idx=}\n  {features=}\n  {labels=}')

    if batch_idx > 1:
        break

batch_idx=0
  features=tensor([[  40,  473, 1846, 2744],
        [ 473, 1846, 2744, 3463]], device='mps:0')
  labels=tensor([[ 473, 1846, 2744, 3463],
        [1846, 2744, 3463, 7762]], device='mps:0')
batch_idx=1
  features=tensor([[1846, 2744, 3463, 7762],
        [2744, 3463, 7762,  480]], device='mps:0')
  labels=tensor([[2744, 3463, 7762,  480],
        [3463, 7762,  480,  285]], device='mps:0')
batch_idx=2
  features=tensor([[ 3463,  7762,   480,   285],
        [ 7762,   480,   285, 22464]], device='mps:0')
  labels=tensor([[ 7762,   480,   285, 22464],
        [  480,   285, 22464,  4856]], device='mps:0')


## Notes

* Smaller batches fit better in memory, but they may lead to noisier model updates
* Overlapping chunks (stride smaller than max length) may cause overfitting

### Digression: multicollinearity

In [32]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

In [31]:
# OneHotEncoder will always produce multicollinearity unless you specifically tell it not to.
# For any categorical variable with n categories, OneHotEncoder by default creates n binary columns where:
# 
# Each row must have exactly one 1 and the rest 0s
# Therefore, the sum across any row must = 1
# This means if you know n-1 columns, you can perfectly predict the nth column
#
# You can avoid this by:
#
# Using drop='first' parameter in OneHotEncoder to drop the first column
# Or using drop='if_binary' which drops a column only when encoding binary features
color_ohe = OneHotEncoder()

In [32]:
colors = np.array(['red', 'blue', 'green'])
colors = colors.reshape(-1, 1)
colors

array([['red'],
       ['blue'],
       ['green']], dtype='<U5')

In [33]:
ohe_matrix = color_ohe.fit_transform(colors).toarray()
ohe_matrix

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [34]:
# Use the transpose to correlate columns against columns.
# Otherwise you'll correlate rows, which is not what you want.
#
# The diagonal is always 1.0 (a variable perfectly correlates with itself).
# The off-diagonal values of -0.5 show negative correlations between different colors.
# When one color is present (1), both other colors must be absent (0).
# This creates a negative relationship: knowing one color is present tells you the others must be absent.
# The -0.5 specifically comes from the balanced nature of your data (equal numbers of each color).
np.corrcoef(ohe_matrix.T)

array([[ 1. , -0.5, -0.5],
       [-0.5,  1. , -0.5],
       [-0.5, -0.5,  1. ]])

In [35]:
colors_v2 = np.array(['red', 'blue', 'green', 'red', 'green']).reshape(-1, 1)
colors_v2

array([['red'],
       ['blue'],
       ['green'],
       ['red'],
       ['green']], dtype='<U5')

In [36]:
ohe_matrix_v2 = color_ohe.fit_transform(colors_v2).toarray()
ohe_matrix_v2

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.]])

In [37]:
np.corrcoef(ohe_matrix_v2.T)

array([[ 1.        , -0.40824829, -0.40824829],
       [-0.40824829,  1.        , -0.66666667],
       [-0.40824829, -0.66666667,  1.        ]])

In [38]:
# The columns follow the encoder's category order (blue, green, red), as the
# encoded matrices above show - not the order in which the colors first appear.
# Red is the most frequent color (3 rows), followed by green (2) and blue (1).
# Blue vs. green is the weakest correlation, blue vs. red is more negative,
# and green vs. red (the two most frequent colors) is the most negative.
colors_v3 = np.array(['red', 'blue', 'green', 'red', 'green', 'red']).reshape(-1, 1)
ohe_matrix_v3 = color_ohe.fit_transform(colors_v3).toarray()
np.corrcoef(ohe_matrix_v3.T)

array([[ 1.        , -0.31622777, -0.4472136 ],
       [-0.31622777,  1.        , -0.70710678],
       [-0.4472136 , -0.70710678,  1.        ]])

In [39]:
np.linalg.matrix_rank(ohe_matrix_v2)

np.int64(3)

# Embeddings

In [40]:
dummy_input_ids = torch.tensor([1, 2, 3, 4, 5])

In [41]:
vocab_size = 6
output_dim = 3

torch.manual_seed(123)

embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
embedding_layer.weight

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)

In [42]:
embedding_layer( torch.tensor([3]) )

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)

In [43]:
embedding_layer( torch.tensor([3]) ) == embedding_layer.weight[3, :]

tensor([[True, True, True]])

In [44]:
embedding_layer( torch.tensor([1]) ) == embedding_layer.weight[1, :]

tensor([[True, True, True]])

In [45]:
embedding_layer( dummy_input_ids )

tensor([[ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], grad_fn=<EmbeddingBackward0>)

In [46]:
onehot = torch.nn.functional.one_hot( dummy_input_ids )
onehot

tensor([[0, 1, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0],
        [0, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 1, 0],
        [0, 0, 0, 0, 0, 1]])

In [47]:
torch.manual_seed(123) # and YES! we DO need this again!

linear = torch.nn.Linear(vocab_size, output_dim, bias=False)
linear.weight

Parameter containing:
tensor([[-0.1665,  0.0135, -0.2028,  0.1540, -0.3479,  0.2993],
        [-0.2967, -0.3246, -0.2580,  0.1849, -0.1508,  0.1528],
        [-0.3465, -0.2477, -0.1499, -0.0802, -0.3114,  0.2673]],
       requires_grad=True)

In [48]:
# Copy the transposed embedding matrix into the linear layer.
# detach().clone() gives the linear layer its own storage; wrapping the bare
# transposed view in a Parameter would alias embedding_layer.weight, so an
# in-place update of one layer would silently change the other.
linear.weight = torch.nn.Parameter(embedding_layer.weight.T.detach().clone())
linear.weight

Parameter containing:
tensor([[ 0.3374,  0.9178,  1.2753, -0.4015, -1.1589, -2.8400],
        [-0.1778,  1.5810, -0.2010,  0.9666,  0.3255, -0.7849],
        [-0.1690,  1.3010, -0.1606, -1.1481, -0.6315, -1.4096]],
       requires_grad=True)

In [49]:
# Matrix multiplication XW^T.
# The matmul by the one-hot encoding is "just" a multiplication by rows of the identity,
# so we get the same embedding.
# But the thing to keep in mind is that the embedding was initialized with random weights,
# so the embedding is just a different way to do a one-hot encoding - preferred because this is now
# a "normal" layer that can be optimized via backpropagation.
linear(onehot.float())

tensor([[ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], grad_fn=<MmBackward0>)

## Positional Embeddings

In [50]:
tokenizer.n_vocab

100277

In [51]:
vocab_size = tokenizer.n_vocab
output_dim = 256 # Size of our embeddings.

token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
token_embedding_layer

Embedding(100277, 256)

In [52]:
max_length = 4
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=8,
    max_length=max_length,
    stride=max_length,
    shuffle=False
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
inputs.shape

torch.Size([8, 4])

In [53]:
token_embeddings = token_embedding_layer(inputs)
token_embeddings.shape

torch.Size([8, 4, 256])

In [54]:
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
pos_embeddings = pos_embedding_layer(torch.arange(max_length))
pos_embeddings.shape

torch.Size([4, 256])

In [55]:
input_embeddings = token_embeddings + pos_embeddings
input_embeddings.shape

torch.Size([8, 4, 256])