### Chapter 2 Exercises and Workspace

In [1]:
from importlib.metadata import version
from importlib import reload

import tiktoken
import torch
import torch.nn as nn

In [2]:
print(f"Tiktoken version: {version('tiktoken')}")

Tiktoken version: 0.8.0


In [3]:
tokenizer = tiktoken.get_encoding('gpt2')

In [4]:
text = ("Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace.")

In [5]:
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 286, 617, 34680, 27271, 13]


In [6]:
strings = tokenizer.decode(integers)
print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace.


**Exercise 2.1**

In [7]:
text = 'Akwirw ier'
integers = tokenizer.encode(text)
print(integers)

[33901, 86, 343, 86, 220, 959]


In [8]:
# Decode every token id on its own to show which sub-word piece each id maps to.
for token_id in integers:
    piece = tokenizer.decode([token_id])
    print(f"{token_id}: {piece}")

33901: Ak
86: w
343: ir
86: w
220:  
959: ier


In [9]:
strings = tokenizer.decode(integers)
print(strings)

Akwirw ier


**Exercise 2.2**

In [10]:
from data_loaders import create_dataloader_v1

In [11]:
# Read the whole training text into memory.
# The encoding is given explicitly so the decoded text does not depend on the
# platform's default locale encoding (assumes the file is UTF-8 -- confirm).
with open('the-verdict.txt', 'r', encoding='utf-8') as file:
    raw_text = file.read()

In [12]:
dataloader = create_dataloader_v1(raw_text, batchsize=8, max_length=4, stride=4, shuffle=False)

In [13]:
data_iter = iter(dataloader)

In [14]:
# Uncomment and run this cell only to reproduce the specific answers to Exercise 2.2 (it consumes the first two batches from data_iter)
#first_batch = next(data_iter)
#print(first_batch)
#second_batch = next(data_iter)
#print(second_batch)

In [15]:
inputs, targets = next(data_iter)

In [16]:
print(f"Inputs: \n{inputs}")
print(f"Targets: \n{targets}")

Inputs: 
tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]], dtype=torch.int32)
Targets: 
tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]], dtype=torch.int32)


#### Practice Embedding 

In [17]:
vocab_size = 50257
output_dim = 256
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [18]:
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [19]:
context_length = 4
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
pos_embeddings.shape

torch.Size([4, 256])

In [20]:
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])


### Chapter 3 Exercises and Workspace

In [21]:
inputs = torch.tensor(
    [[0.43, 0.15, 0.89],
     [0.55, 0.87, 0.66],
     [0.57, 0.85, 0.64],
     [0.22, 0.58, 0.33],
     [0.77, 0.25, 0.10],
     [0.05, 0.80, 0.55]]
)

from attention_heads import SelfAttention_v2
from attention_heads import SelfAttention_v1

In [22]:
torch.manual_seed(789)

<torch._C.Generator at 0x123339a70>

**Exercise 3.1**

In [23]:
sa_v2 = SelfAttention_v2(3, 2)
sa_v1 = SelfAttention_v1(3, 2)

In [24]:
print(sa_v2(inputs))

tensor([[-0.0739,  0.0713],
        [-0.0748,  0.0703],
        [-0.0749,  0.0702],
        [-0.0760,  0.0685],
        [-0.0763,  0.0679],
        [-0.0754,  0.0693]], grad_fn=<MmBackward0>)


In [25]:
# Before the v2 weights are copied over, sa_v1 does not reproduce sa_v2's output (compare with the cell above)
print(sa_v1(inputs))

tensor([[1.0177, 0.5637],
        [1.0427, 0.5773],
        [1.0428, 0.5774],
        [1.0153, 0.5620],
        [1.0286, 0.5701],
        [1.0162, 0.5622]], grad_fn=<MmBackward0>)


In [26]:
# assigning the v2 parameters to the W matrices of the v1 instance
# Transpose the weight matrix of nn.Linear object, then turn it back into a nn.Parameter object
# Exercise 3.1: copy the sa_v2 projection weights into the sa_v1 instance.
# Each nn.Linear weight is transposed before being wrapped in a new nn.Parameter
# (the printed query matrix below is the transposed, 3x2 version).
# .detach().clone() gives sa_v1 its own storage; wrapping the bare transposed view
# would leave both modules sharing the same underlying memory, so an in-place
# update to one would silently change the other.
w_query_temp = sa_v2.W_query.weight
w_key_temp = sa_v2.W_key.weight
w_value_temp = sa_v2.W_value.weight

sa_v1.W_query = nn.Parameter(w_query_temp.T.detach().clone())
sa_v1.W_key = nn.Parameter(w_key_temp.T.detach().clone())
sa_v1.W_value = nn.Parameter(w_value_temp.T.detach().clone())

# Show the transposed query matrix that sa_v1 now owns.
print(sa_v1.W_query)

Parameter containing:
tensor([[ 0.3161, -0.1683],
        [ 0.4568, -0.3379],
        [ 0.5118, -0.0918]], requires_grad=True)


In [27]:
# verify that the v1 output now matches the v2 output
print(sa_v1(inputs))

tensor([[-0.0739,  0.0713],
        [-0.0748,  0.0703],
        [-0.0749,  0.0702],
        [-0.0760,  0.0685],
        [-0.0763,  0.0679],
        [-0.0754,  0.0693]], grad_fn=<MmBackward0>)


In [28]:
# Recompute the attention weights by hand from sa_v2's query and key projections.
queries, keys = sa_v2.W_query(inputs), sa_v2.W_key(inputs)
attn_scores = torch.matmul(queries, keys.T)

# Scale the scores by the square root of the key dimension before the row-wise softmax.
d_k = keys.shape[-1]
attn_weights = (attn_scores / d_k**0.5).softmax(dim=-1)
print(attn_weights)

tensor([[0.1921, 0.1646, 0.1652, 0.1550, 0.1721, 0.1510],
        [0.2041, 0.1659, 0.1662, 0.1496, 0.1665, 0.1477],
        [0.2036, 0.1659, 0.1662, 0.1498, 0.1664, 0.1480],
        [0.1869, 0.1667, 0.1668, 0.1571, 0.1661, 0.1564],
        [0.1830, 0.1669, 0.1670, 0.1588, 0.1658, 0.1585],
        [0.1935, 0.1663, 0.1666, 0.1542, 0.1666, 0.1529]],
       grad_fn=<SoftmaxBackward0>)


In [29]:
context_length = attn_weights.shape[0]
mask_simple = torch.tril(torch.ones(context_length, context_length))
print(mask_simple)

tensor([[1., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1.]])


In [30]:
torch.manual_seed(123)
dropout = torch.nn.Dropout(0.5)
examples = torch.ones(6,6)
print(dropout(examples))

tensor([[2., 2., 0., 2., 2., 0.],
        [0., 0., 0., 2., 0., 2.],
        [2., 2., 2., 2., 0., 2.],
        [0., 2., 2., 0., 0., 2.],
        [0., 2., 0., 2., 0., 2.],
        [0., 2., 2., 2., 2., 0.]])


In [31]:
torch.manual_seed(123)
print(dropout(attn_weights))

tensor([[0.3843, 0.3293, 0.0000, 0.3100, 0.3442, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.2992, 0.0000, 0.2955],
        [0.4071, 0.3318, 0.3325, 0.2996, 0.0000, 0.2961],
        [0.0000, 0.3334, 0.3337, 0.0000, 0.0000, 0.3128],
        [0.0000, 0.3337, 0.0000, 0.3177, 0.0000, 0.3169],
        [0.0000, 0.3327, 0.3331, 0.3084, 0.3331, 0.0000]],
       grad_fn=<MulBackward0>)


In [32]:
batch = torch.stack((inputs, inputs), dim=0)
print(batch.shape)

torch.Size([2, 6, 3])


### Chapter 4 Exercises and Workspace

In [39]:
import tiktoken

tokenizer = tiktoken.get_encoding('gpt2')
txt1 = 'Every effort moves you'
txt2 = 'Every day holds a'

# Encode both prompts and stack them into one (2, num_tokens) tensor.
# torch.stack needs equally sized rows; both prompts encode to four tokens (see output).
batch = torch.stack(
    [torch.tensor(tokenizer.encode(prompt)) for prompt in (txt1, txt2)],
    dim=0,
)
print(batch)

tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])


In [33]:
from model_architecture import TransformerBlock

GPT_CONFIG_124M = {
    'vocab_size': 50257,
    'context_length': 1024,
    'emb_dim': 768,
    'n_heads': 12,
    'n_layers': 12,
    'drop_rate': 0.1,
    'qkv_bias': False,
}

In [34]:
torch.manual_seed(123)
# Dummy input: 2 sequences of 4 positions each. The last dimension is read from the
# config instead of repeating the literal 768, so this cell keeps working if the
# config's embedding size is changed.
x = torch.rand(2, 4, GPT_CONFIG_124M['emb_dim'])
block = TransformerBlock(GPT_CONFIG_124M)
output = block(x)

In [35]:
print(f"Input shape: {x.shape}")
print(f"Output shape: {output.shape}")

Input shape: torch.Size([2, 4, 768])
Output shape: torch.Size([2, 4, 768])


In [36]:
from model_architecture import GPTModel
torch.manual_seed(123)
model = GPTModel(GPT_CONFIG_124M)

In [40]:
out = model(batch)
print(f"Input batch: \n{batch}")
print(f"\nOutput shape: {out.shape}")
print(out)

Input batch: 
tensor([[6109, 3626, 6100,  345],
        [6109, 1110, 6622,  257]])

Output shape: torch.Size([2, 4, 50257])
tensor([[[ 0.3613,  0.4222, -0.0711,  ...,  0.3483,  0.4661, -0.2838],
         [-0.1792, -0.5660, -0.9485,  ...,  0.0477,  0.5181, -0.3168],
         [ 0.7120,  0.0332,  0.1085,  ...,  0.1018, -0.4327, -0.2553],
         [-1.0076,  0.3418, -0.1190,  ...,  0.7195,  0.4023,  0.0532]],

        [[-0.2564,  0.0900,  0.0335,  ...,  0.2659,  0.4454, -0.6806],
         [ 0.1230,  0.3653, -0.2074,  ...,  0.7705,  0.2710,  0.2246],
         [ 1.0558,  1.0318, -0.2800,  ...,  0.6936,  0.3205, -0.3178],
         [-0.1565,  0.3926,  0.3288,  ...,  1.2630, -0.1858,  0.0388]]],
       grad_fn=<UnsafeViewBackward0>)


In [41]:
total_params = sum(p.numel() for p in model.parameters())
print(f"Total number of parameters: {total_params:,}")

Total number of parameters: 163,009,536


In [42]:
print("Token embeddings layer shape:", model.tok_emb.weight.shape)
print("Output layer shape:", model.out_head.weight.shape)

Token embeddings layer shape: torch.Size([50257, 768])
Output layer shape: torch.Size([50257, 768])


**Exercise 4.1**

In [64]:
# Exercise 4.1: compare how many parameters live in the feed-forward modules versus
# the multi-head attention modules, summed over every transformer block.
# The per-block figures divide by the actual number of blocks rather than a
# hard-coded 12, so the cell stays correct for models built from other configs.
trf_blocks = list(model.trf_blocks)
n_blocks = len(trf_blocks)

total_params_ff = sum(p.numel() for blk in trf_blocks for p in blk.ff.parameters())
print(f"Total feed forward paramters: {total_params_ff:,}")
print(f"Feed forward paramters per block: {total_params_ff//n_blocks:,}")

total_params_att = sum(p.numel() for blk in trf_blocks for p in blk.att.parameters())
print(f"Total multi-head attention parameters: {total_params_att:,}")
print(f"Multi-head attention parameters per block: {total_params_att//n_blocks:,}")

Total feed forward paramters: 56,669,184
Feed forward paramters per block: 4,722,432
Total multi-head attention parameters: 28,320,768
Multi-head attention parameters per block: 2,360,064


**Exercise 4.2**

In [65]:
def _gpt2_config(emb_dim, n_heads, n_layers):
    """Build a GPT-2 config dict; only the three size settings differ between variants."""
    return {
        'vocab_size': 50257,
        'context_length': 1024,
        'emb_dim': emb_dim,
        'n_heads': n_heads,
        'n_layers': n_layers,
        'drop_rate': 0.1,
        'qkv_bias': False,
    }


# Exercise 4.2: configurations for the larger GPT-2 variants.
GPT_CONFIG_MED = _gpt2_config(emb_dim=1024, n_heads=16, n_layers=24)
GPT_CONFIG_LARGE = _gpt2_config(emb_dim=1280, n_heads=20, n_layers=36)
GPT_CONFIG_XLARGE = _gpt2_config(emb_dim=1600, n_heads=25, n_layers=48)

In [66]:
model_med = GPTModel(GPT_CONFIG_MED)
model_large = GPTModel(GPT_CONFIG_LARGE)
model_xlarge = GPTModel(GPT_CONFIG_XLARGE)

In [68]:
def _num_params(module):
    """Return the total number of elements across all parameters of ``module``."""
    return sum(param.numel() for param in module.parameters())


# Total parameter count of each larger GPT-2 variant.
total_params_med = _num_params(model_med)
total_params_large = _num_params(model_large)
total_params_xlarge = _num_params(model_xlarge)

print(f"Total number of parameters for GPT-2 Medium: {total_params_med:,}")
print(f"Total number of parameters for GPT-2 Large: {total_params_large:,}")
print(f"Total number of parameters for GPT-2 X-Large: {total_params_xlarge:,}")

Total number of parameters for GPT-2 Medium: 406,212,608
Total number of parameters for GPT-2 Large: 838,220,800
Total number of parameters for GPT-2 X-Large: 1,637,792,000


In [69]:
def generate_text_simple(model, idx, max_new_tokens, context_size):
    """Greedily append ``max_new_tokens`` predicted token ids to ``idx``.

    Args:
        model: Callable that takes a 2-D tensor of token ids and returns a 3-D
            tensor of scores; only the scores at the last position of dim 1 are used.
        idx: 2-D tensor of token ids; new ids are concatenated along dim 1.
        max_new_tokens: Number of ids to generate (one per loop iteration).
        context_size: Maximum number of trailing ids (along dim 1) passed to
            ``model`` at each step.

    Returns:
        A tensor holding ``idx`` followed by the generated ids along dim 1.
        The input tensor itself is not modified.
    """
    tokens = idx
    for _step in range(max_new_tokens):
        # Feed the model only the most recent `context_size` ids.
        window = tokens[:, -context_size:]
        with torch.no_grad():
            scores = model(window)

        # Keep the scores of the last position and pick the most probable id.
        last_scores = scores[:, -1, :]
        probas = torch.softmax(last_scores, dim=-1)
        next_token = probas.argmax(dim=-1, keepdim=True)
        tokens = torch.cat((tokens, next_token), dim=1)
    return tokens