In [1]:
import os
import urllib.request

In [2]:
# Download the sample text only when it is not already on disk, so that
# re-running the notebook does not repeat the download or require network access.

url = ("https://raw.githubusercontent.com/rasbt/"
       "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
       "the-verdict.txt")
file_path = "the-verdict.txt"
if not os.path.exists(file_path):
    urllib.request.urlretrieve(url, file_path)

('the-verdict.txt', <http.client.HTTPMessage at 0x19dcda0dc60>)

In [3]:
# Load the whole story into memory as a single string
with open("the-verdict.txt", mode="r", encoding="utf-8") as story_file:
    raw_text = story_file.read()

# Report the size and show the first 99 characters as a sanity check
char_count = len(raw_text)
print("Total number of character:", char_count)
preview = raw_text[:99]
print(preview)

Total number of character: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


## BytePair Encoding

In [4]:
# import tiktoken, the library used below to get a byte pair encoding (BPE) tokenizer
import tiktoken

In [5]:
# Build the BPE tokenizer from tiktoken's "gpt2" encoding
tokenizer = tiktoken.get_encoding("gpt2")

## Data sampling with a sliding window

In [6]:
# Read the story from disk and convert it to a list of BPE token IDs
with open("the-verdict.txt", mode="r", encoding="utf-8") as story_file:
    raw_text = story_file.read()

tokenized_text = tokenizer.encode(raw_text)
token_count = len(tokenized_text)
print("Total number of tokens:", token_count)


Total number of tokens: 5145


In [10]:
# Preview the first 10 token IDs next to the text each one decodes to.
# The slice is taken once and iterated directly, so a text with fewer than
# 10 tokens prints what it has instead of raising IndexError.
first_tokens = tokenized_text[:10]

print("First 10 tokens and their corresponding characters:")
for token_id in first_tokens:
    # decode() is given a one-element list so each token is rendered on its own
    print(tokenizer.decode([token_id]), end=", ")
print()  # finish the comma-separated line

# Leave the ID list as the last expression so the notebook displays it
print("First 10 tokens:")
first_tokens

First 10 tokens and their corresponding characters:
I,  H, AD,  always,  thought,  Jack,  G, is, burn,  rather, 
First 10 tokens:


[40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138]

In [7]:
# implement the dataset that the dataloader will draw (input, target) windows from
import torch
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    """Sliding-window dataset of (input, target) token-ID pairs.

    The text is tokenized once, then cut into windows of ``max_length`` tokens
    whose start positions are ``stride`` tokens apart. Each target window is
    its input window shifted one token to the right, so ``target[k]`` is the
    token that follows ``input[k]`` in the text.

    Only full-length windows are produced: a start position is used only if
    both the input and the shifted target fit inside the token sequence. A
    text with ``max_length`` tokens or fewer therefore yields an empty dataset.

    Args:
        text: The raw string to tokenize.
        tokenizer: Any object with an ``encode(text)`` method returning a
            sequence of integer token IDs.
        max_length: Number of tokens per window; must be >= 1.
        stride: Distance in tokens between consecutive window starts;
            must be >= 1. ``stride == max_length`` gives non-overlapping inputs.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is less than 1. Without this
            check such values would produce empty or malformed windows
            without any error.
    """

    def __init__(self, text, tokenizer, max_length, stride):
        if max_length < 1:
            raise ValueError(f"max_length must be >= 1, got {max_length}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")

        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(text)

        # The target needs one token beyond the input window, so the last
        # usable start index is len(token_ids) - max_length - 1 (the range
        # bound below is exclusive).
        stop = len(token_ids) - max_length
        for start in range(0, stop, stride):
            end = start + max_length
            self.input_ids.append(torch.tensor(token_ids[start:end]))
            self.target_ids.append(torch.tensor(token_ids[start + 1:end + 1]))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair for window ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]



In [8]:
# define a dataloader
def create_dataloader(text, max_length=256, stride=128, batch_size=4,
                      shuffle=True, drop_last=True, num_workers=0):
    """Tokenize ``text`` with the "gpt2" encoding and wrap it in a DataLoader.

    ``text``, ``max_length`` and ``stride`` are passed to ``TextDataset``;
    ``batch_size``, ``shuffle``, ``drop_last`` and ``num_workers`` are passed
    unchanged to ``torch.utils.data.DataLoader``.

    Returns:
        A ``DataLoader`` over the resulting ``TextDataset``.
    """
    bpe = tiktoken.get_encoding("gpt2")
    windows = TextDataset(text, bpe, max_length, stride)

    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windows, **loader_options)

In [11]:
# Windows of 4 tokens with stride 4 (no overlap between inputs),
# 8 windows per batch, in their original order (no shuffling)
dataloader = create_dataloader(
    raw_text,
    max_length=4,
    stride=4,
    batch_size=8,
    shuffle=False,
)

# Fetch only the first batch for inspection
data_iter = iter(dataloader)
first_batch = next(data_iter)
inputs, targets = first_batch
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


## Creating token embeddings

In [12]:
# Embedding table dimensions: one row per token ID, output_dim values per row
vocab_size = 50257  # presumably the size of the "gpt2" vocabulary; confirm against the tokenizer
output_dim = 256

token_embedding_layer = torch.nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=output_dim,
)

In [13]:
# Window size used for both the dataloader and the positional embeddings below
max_length = 4

# stride == max_length, so consecutive input windows do not overlap
dataloader = create_dataloader(
    raw_text,
    max_length=max_length,
    stride=max_length,
    batch_size=8,
    shuffle=False,
)

# Keep the first batch around for the embedding steps
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

In [14]:
# Show the batch of token IDs followed by its shape
print("Token IDs:\n", inputs)
batch_shape = inputs.size()
print("\nInputs shape:\n", batch_shape)

Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

Inputs shape:
 torch.Size([8, 4])


In [15]:
# Look up one embedding vector for every token ID in the batch
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.size())

print(token_embeddings)

torch.Size([8, 4, 256])
tensor([[[-1.0600e+00, -9.3302e-01, -2.2327e-01,  ..., -1.2109e+00,
           1.2148e-01,  3.4232e-01],
         [-5.9225e-01,  1.4231e+00,  4.2242e-01,  ..., -9.2771e-01,
          -2.9253e-01,  7.7361e-01],
         [-1.8892e+00, -2.3342e+00,  7.3579e-01,  ...,  1.2672e+00,
          -9.5780e-01, -1.4971e+00],
         [-5.0961e-01,  6.9639e-01, -2.2804e+00,  ...,  4.7866e-02,
          -4.1974e-01,  7.2397e-01]],

        [[ 8.2744e-01, -8.1513e-01, -1.2839e+00,  ...,  1.6208e+00,
           3.8374e-01,  1.7807e+00],
         [ 9.3822e-01,  1.1611e+00,  4.5063e-01,  ...,  2.6580e-01,
          -1.4719e+00, -1.6002e+00],
         [-4.9308e-01, -5.9309e-01,  3.1158e-01,  ...,  4.4791e-01,
           1.4027e+00,  6.4769e-01],
         [-2.0452e-01, -1.1304e+00, -2.5886e-01,  ..., -3.2454e-01,
          -1.3746e+00, -9.4640e-02]],

        [[-4.6878e-01, -1.1896e+00,  1.1188e-01,  ...,  1.1004e+00,
          -1.9018e-01,  4.8035e-01],
         [ 7.5792e-01, -1.0

In [16]:
# One embedding row per position in the window, same width as the token embeddings
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(
    num_embeddings=context_length,
    embedding_dim=output_dim,
)

print(pos_embedding_layer.weight)

Parameter containing:
tensor([[ 1.1103,  2.0699,  0.5742,  ...,  0.7350,  2.5275, -1.0930],
        [ 1.0984, -0.6022,  0.8537,  ...,  0.4693,  0.9784, -0.3690],
        [ 1.5486, -1.6799, -1.1305,  ...,  0.9061, -1.3600,  1.6787],
        [-0.3629, -0.3601, -0.7042,  ..., -0.6867, -0.9837,  0.8112]],
       requires_grad=True)


In [17]:
# Position indices 0 .. max_length-1, looked up in the positional embedding table
positions = torch.arange(max_length)
pos_embeddings = pos_embedding_layer(positions)
print(pos_embeddings.size())

print(pos_embeddings)

torch.Size([4, 256])
tensor([[ 1.1103,  2.0699,  0.5742,  ...,  0.7350,  2.5275, -1.0930],
        [ 1.0984, -0.6022,  0.8537,  ...,  0.4693,  0.9784, -0.3690],
        [ 1.5486, -1.6799, -1.1305,  ...,  0.9061, -1.3600,  1.6787],
        [-0.3629, -0.3601, -0.7042,  ..., -0.6867, -0.9837,  0.8112]],
       grad_fn=<EmbeddingBackward0>)


In [18]:
# Add the positional vectors to the token vectors; the (4, 256) positional
# tensor is broadcast across the 8 sequences of the (8, 4, 256) token tensor
input_embeddings = pos_embeddings + token_embeddings
print(input_embeddings.size())

print(input_embeddings)

torch.Size([8, 4, 256])
tensor([[[ 0.0503,  1.1369,  0.3509,  ..., -0.4759,  2.6490, -0.7507],
         [ 0.5062,  0.8209,  1.2761,  ..., -0.4584,  0.6859,  0.4047],
         [-0.3406, -4.0142, -0.3947,  ...,  2.1732, -2.3178,  0.1816],
         [-0.8725,  0.3363, -2.9847,  ..., -0.6388, -1.4035,  1.5352]],

        [[ 1.9377,  1.2547, -0.7097,  ...,  2.3558,  2.9113,  0.6877],
         [ 2.0367,  0.5589,  1.3044,  ...,  0.7351, -0.4935, -1.9691],
         [ 1.0555, -2.2730, -0.8189,  ...,  1.3540,  0.0427,  2.3264],
         [-0.5674, -1.4905, -0.9631,  ..., -1.0112, -2.3583,  0.7166]],

        [[ 0.6415,  0.8803,  0.6861,  ...,  1.8354,  2.3374, -0.6127],
         [ 1.8564, -1.6628,  0.3091,  ..., -0.6763,  2.1428,  0.0231],
         [ 3.2024, -2.4323,  0.2856,  ...,  2.1143, -3.2175,  0.1553],
         [-1.3803, -1.7892, -1.2512,  ..., -1.0603, -1.3965,  1.5822]],

        ...,

        [[ 0.8608,  2.0606,  0.6843,  ...,  1.2999,  1.3922, -0.0401],
         [ 0.8620, -1.9827, -0.88