### Chapter 2 Problems and Workspace

In [1]:
from importlib.metadata import version
from importlib import reload

import tiktoken
import torch
import torch.nn as nn

In [2]:
# Record which tiktoken release produced the token ids shown in this notebook.
tiktoken_version = version("tiktoken")
print(f"Tiktoken version: {tiktoken_version}")

Tiktoken version: 0.8.0


In [3]:
# Load tiktoken's 'gpt2' encoding; every encode/decode call below goes through this object.
tokenizer = tiktoken.get_encoding('gpt2')

In [4]:
# Sample text: two sentences joined by the <|endoftext|> marker, ending in a
# word ("someunknownPlace") that the tokenizer has to split into several pieces.
text = (
    "Hello, do you like tea? <|endoftext|> "
    "In the sunlit terraces of someunknownPlace."
)

In [5]:
# Explicitly allow the special marker so it is encoded as one id
# (50256 in the printed list) instead of being treated as ordinary text.
special_tokens = {"<|endoftext|>"}
integers = tokenizer.encode(text, allowed_special=special_tokens)
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 286, 617, 34680, 27271, 13]


In [6]:
# Round trip: decoding the ids reproduces the original text, as the printed output shows.
strings = tokenizer.decode(integers)
print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknownPlace.


**Problem 2.1**

In [7]:
# Encode a short test string with the same tokenizer.
# Note: `text` and `integers` from the cells above are overwritten here.
text = 'Akwirw ier'
integers = tokenizer.encode(text)
print(integers)

[33901, 86, 343, 86, 220, 959]


In [8]:
# Decode each id on its own to see which text fragment every token represents.
for token_id in integers:
    fragment = tokenizer.decode([token_id])
    print(f"{token_id}: {fragment}")

33901: Ak
86: w
343: ir
86: w
220:  
959: ier


In [9]:
# Decoding the full id list at once gives back the original string.
strings = tokenizer.decode(integers)
print(strings)

Akwirw ier


**Problem 2.2**

In [10]:
from data_loaders import create_dataloader_v1

In [11]:
# Read the whole training text into memory.
# The encoding is pinned to UTF-8 so the decoded text (and therefore the token ids
# produced from it) does not depend on the platform's default locale encoding.
# NOTE(review): assumes the-verdict.txt is UTF-8 encoded - confirm.
with open('the-verdict.txt', 'r', encoding='utf-8') as file:
    raw_text = file.read()

In [12]:
# stride equals max_length here; in the printed batch each input row picks up
# where the previous row's target ended, i.e. the windows do not overlap.
dataloader = create_dataloader_v1(
    raw_text,
    batchsize=8,
    max_length=4,
    stride=4,
    shuffle=False,
)

In [13]:
# Create one iterator over the dataloader; the cells below pull batches from it with next().
data_iter = iter(dataloader)

In [14]:
# Only run this cell for the specific answers to problem 2.2.
# NOTE: every next(data_iter) call consumes a batch from the shared iterator, so if
# these lines are enabled, the `inputs, targets = next(data_iter)` cell that follows
# receives a later batch than it would otherwise.
#first_batch = next(data_iter)
#print(first_batch)
#second_batch = next(data_iter)
#print(second_batch)

In [15]:
# Pull the next batch from the iterator; each batch unpacks into an (inputs, targets) pair.
inputs, targets = next(data_iter)

In [16]:
# Show both tensors; in the printed output each targets row is the matching
# inputs row shifted left by one token.
for label, batch_tensor in (("Inputs", inputs), ("Targets", targets)):
    print(f"{label}: \n{batch_tensor}")

Inputs: 
tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]], dtype=torch.int32)
Targets: 
tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]], dtype=torch.int32)


### Practice Embedding 

In [17]:
# Embedding table with one 256-dimensional row per token id.
# vocab_size is presumably the size of the 'gpt2' vocabulary
# (the largest id printed above is 50256) - confirm against the tokenizer.
vocab_size, output_dim = 50257, 256
token_embedding_layer = torch.nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=output_dim,
)

In [18]:
# Look up an embedding vector for every token id in the batch:
# the (8, 4) id tensor becomes an (8, 4, 256) tensor of vectors.
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [19]:
# One embedding vector per position 0..context_length-1, with the same width
# (output_dim) as the token embeddings so the two can be added.
context_length = 4
pos_embedding_layer = torch.nn.Embedding(
    num_embeddings=context_length,
    embedding_dim=output_dim,
)
position_ids = torch.arange(context_length)
pos_embeddings = pos_embedding_layer(position_ids)
pos_embeddings.shape

torch.Size([4, 256])

In [20]:
# Add positional information to every sequence in the batch: the (4, 256)
# positional tensor is broadcast across the batch dimension of the (8, 4, 256) tensor.
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])


### Chapter 3 Problems and Workspace

In [24]:
# Toy sequence for the attention experiments: six rows, each a 3-dimensional vector.
# Note: this rebinds `inputs`, replacing the token-id batch used in the Chapter 2 cells.
example_rows = [
    [0.43, 0.15, 0.89],
    [0.55, 0.87, 0.66],
    [0.57, 0.85, 0.64],
    [0.22, 0.58, 0.33],
    [0.77, 0.25, 0.10],
    [0.05, 0.80, 0.55],
]
inputs = torch.tensor(example_rows)

from attention_heads import SelfAttention_v2
from attention_heads import SelfAttention_v1

In [27]:
# Fix torch's global RNG seed before the attention modules are instantiated below,
# presumably so that their random initial weights are reproducible.
torch.manual_seed(789)

<torch._C.Generator at 0x10c3e1a70>

**Problem 3.1**

In [28]:
# Both modules map 3-dimensional input rows to 2-dimensional outputs.
# sa_v2 is created first, matching the original order, so the random draws used
# for initialisation happen in the same sequence.
d_in, d_out = 3, 2
sa_v2 = SelfAttention_v2(d_in, d_out)
sa_v1 = SelfAttention_v1(d_in, d_out)

In [29]:
# Forward pass through sa_v2: six input rows in, one 2-dimensional output row per input.
print(sa_v2(inputs))

tensor([[-0.0739,  0.0713],
        [-0.0748,  0.0703],
        [-0.0749,  0.0702],
        [-0.0760,  0.0685],
        [-0.0763,  0.0679],
        [-0.0754,  0.0693]], grad_fn=<MmBackward0>)


In [30]:
# Before any weights are copied, sa_v1 still has its own independent initialisation,
# so its output differs from the sa_v2 output printed above.
print(sa_v1(inputs))

tensor([[1.0177, 0.5637],
        [1.0427, 0.5773],
        [1.0428, 0.5774],
        [1.0153, 0.5620],
        [1.0286, 0.5701],
        [1.0162, 0.5622]], grad_fn=<MmBackward0>)


In [51]:
# Copy sa_v2's weights into sa_v1 so both modules compute the same output.
# The weight matrix of each nn.Linear object is transposed before it is turned back
# into an nn.Parameter, because sa_v1 expects the transposed layout (3x2, as printed below).
# detach().clone() gives sa_v1 its own storage: wrapping the bare `.T` view in
# nn.Parameter would leave both modules sharing memory, so an in-place or optimizer
# update to one would silently change the other.
w_query_temp = sa_v2.W_query.weight
w_key_temp = sa_v2.W_key.weight
w_value_temp = sa_v2.W_value.weight

sa_v1.W_query = nn.Parameter(w_query_temp.detach().T.clone())
sa_v1.W_key = nn.Parameter(w_key_temp.detach().T.clone())
sa_v1.W_value = nn.Parameter(w_value_temp.detach().T.clone())

# Show the parameter that was actually installed rather than a throwaway copy.
print(sa_v1.W_query)

Parameter containing:
tensor([[ 0.3161, -0.1683],
        [ 0.4568, -0.3379],
        [ 0.5118, -0.0918]], requires_grad=True)


In [52]:
# verify that the v1 output now matches the v2 output
v1_output = sa_v1(inputs)
print(v1_output)
# Fail loudly if the weight transfer did not take effect, instead of relying on a
# visual comparison of the printed tensors.
assert torch.allclose(v1_output, sa_v2(inputs), atol=1e-6), "sa_v1 and sa_v2 outputs differ"

tensor([[-0.0739,  0.0713],
        [-0.0748,  0.0703],
        [-0.0749,  0.0702],
        [-0.0760,  0.0685],
        [-0.0763,  0.0679],
        [-0.0754,  0.0693]], grad_fn=<MmBackward0>)


In [54]:
# Recompute the attention weights by hand from sa_v2's query and key projections:
# pairwise dot products, scaled by sqrt(key dimension), then softmax over each row.
queries = sa_v2.W_query(inputs)
keys = sa_v2.W_key(inputs)
d_k = keys.shape[-1]
attn_scores = torch.matmul(queries, keys.transpose(0, 1))
attn_weights = (attn_scores / d_k**0.5).softmax(dim=-1)
print(attn_weights)

tensor([[0.1921, 0.1646, 0.1652, 0.1550, 0.1721, 0.1510],
        [0.2041, 0.1659, 0.1662, 0.1496, 0.1665, 0.1477],
        [0.2036, 0.1659, 0.1662, 0.1498, 0.1664, 0.1480],
        [0.1869, 0.1667, 0.1668, 0.1571, 0.1661, 0.1564],
        [0.1830, 0.1669, 0.1670, 0.1588, 0.1658, 0.1585],
        [0.1935, 0.1663, 0.1666, 0.1542, 0.1666, 0.1529]],
       grad_fn=<SoftmaxBackward0>)


In [55]:
# Lower-triangular matrix of ones with the same side length as the attention matrix:
# row i has ones in columns 0..i and zeros after that.
# Note: this rebinds `context_length` (it was 4 in the embedding cells).
context_length = attn_weights.size(0)
mask_simple = torch.ones(context_length, context_length).tril()
print(mask_simple)

tensor([[1., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1.]])


In [62]:
# Dropout demo on a matrix of ones: with p=0.5 each entry is either zeroed or
# scaled to 2 (see the printed output). The seed fixes which entries are dropped.
torch.manual_seed(123)
dropout = torch.nn.Dropout(p=0.5)
examples = torch.ones(6, 6)
dropped_examples = dropout(examples)
print(dropped_examples)

tensor([[2., 2., 0., 2., 2., 0.],
        [0., 0., 0., 2., 0., 2.],
        [2., 2., 2., 2., 0., 2.],
        [0., 2., 2., 0., 0., 2.],
        [0., 2., 0., 2., 0., 2.],
        [0., 2., 2., 2., 2., 0.]])


In [63]:
# Re-seed with the same value as the dropout demo on the ones matrix so the same
# entries are dropped here. Surviving weights are doubled (e.g. 0.1921 -> 0.3843),
# so the rows no longer sum to 1.
torch.manual_seed(123)
print(dropout(attn_weights))

tensor([[0.3843, 0.3293, 0.0000, 0.3100, 0.3442, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.2992, 0.0000, 0.2955],
        [0.4071, 0.3318, 0.3325, 0.2996, 0.0000, 0.2961],
        [0.0000, 0.3334, 0.3337, 0.0000, 0.0000, 0.3128],
        [0.0000, 0.3337, 0.0000, 0.3177, 0.0000, 0.3169],
        [0.0000, 0.3327, 0.3331, 0.3084, 0.3331, 0.0000]],
       grad_fn=<MulBackward0>)


In [64]:
# Stack two copies of the 6x3 example along a new leading dimension to get a
# batch of two identical sequences, shape (2, 6, 3).
batch = torch.stack([inputs, inputs], dim=0)
print(batch.shape)

torch.Size([2, 6, 3])
