# Exercise 2.1: Byte pair encoding of unknown words #
By Shamsher

In [1]:
from importlib.metadata import version
import tiktoken

# Look up the installed tiktoken distribution's version through the
# importlib.metadata helper imported at the top of this cell, rather than
# relying on the optional `__version__` module attribute.
print(f'the version of tiktoken is {version("tiktoken")}')

the version of tiktoken is 0.9.0


In [2]:
# Load tiktoken's "gpt2" encoding; the Exercise 2.1 cells below reuse this tokenizer.

tokenizer = tiktoken.get_encoding("gpt2")

In [3]:
# Made-up sample string used to see how the tokenizer handles an unknown word.
text = 'Akwirw ier'

# allowed_special whitelists the "<|endoftext|>" marker for tiktoken;
# this sample text does not contain it.
encoded_text = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(encoded_text)

[33901, 86, 343, 86, 220, 959]


In [4]:
# Decode every id on its own to see which piece of the text it stands for.
for i in encoded_text:
    print(f'{i} --> {tokenizer.decode([i])}')

33901 --> Ak
86 --> w
343 --> ir
86 --> w
220 -->  
959 --> ier


In [5]:
# Encode the piece 'Ak' in isolation to compare its id with the full-string encoding.
print(tokenizer.encode('Ak'))

[33901]


In [6]:
# Encode the piece 'w' in isolation to compare its id with the full-string encoding.
print(tokenizer.encode('w'))

[86]


In [7]:
# Encode the piece 'ir' in isolation to compare its id with the full-string encoding.
print(tokenizer.encode('ir'))

[343]


In [8]:
# Encode the piece 'ier' in isolation to compare its id with the full-string encoding.
print(tokenizer.encode('ier'))

[959]


In [9]:
# Encode a single space in isolation to compare its id with the full-string encoding.
print(tokenizer.encode(' '))

[220]


In [10]:
# Decode the whole id list at once to check that it reproduces the original text.
print(tokenizer.decode(encoded_text))

Akwirw ier


### end of exercise 2.1 ###

# Exercise 2.2 #

Data loaders with different strides and context sizes

In [17]:
 import torch
 from torch.utils.data import Dataset, DataLoader

 class GPTDatasetV1(Dataset):
     """Sliding-window dataset of (input, target) token-id tensor pairs.

     The text is encoded once with ``tokenizer.encode``. Windows of
     ``max_length`` ids are then taken every ``stride`` positions. Each
     target window is the matching input window shifted one position to
     the right.
     """

     def __init__(self, txt, tokenizer, max_length, stride):
         """Encode ``txt`` and pre-build every input/target tensor.

         Args:
             txt: Text handed to ``tokenizer.encode``.
             tokenizer: Object whose ``encode(txt)`` returns a sequence of ids.
             max_length: Number of ids in each input and target window.
             stride: Distance between the starts of consecutive windows.
         """
         encoded = tokenizer.encode(txt)
         # The range stops early enough that the shifted target window never
         # runs past the end of the encoded sequence.
         window_starts = range(0, len(encoded) - max_length, stride)

         self.input_ids = [
             torch.tensor(encoded[start:start + max_length])
             for start in window_starts
         ]
         self.target_ids = [
             torch.tensor(encoded[start + 1:start + max_length + 1])
             for start in window_starts
         ]

     def __len__(self):
         """Return the number of windows that were built."""
         return len(self.input_ids)

     def __getitem__(self, idx):
         """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
         return self.input_ids[idx], self.target_ids[idx]

In [18]:
 def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
     """Build a DataLoader over sliding windows of GPT-2 token ids from ``txt``.

     Args:
         txt: Text to tokenize with tiktoken's "gpt2" encoding.
         batch_size: Forwarded to ``DataLoader``.
         max_length: Window length passed to ``GPTDatasetV1``.
         stride: Window step passed to ``GPTDatasetV1``.
         shuffle: Forwarded to ``DataLoader``.
         drop_last: Forwarded to ``DataLoader``.
         num_workers: Forwarded to ``DataLoader``.

     Returns:
         A ``DataLoader`` wrapping the ``GPTDatasetV1`` built from ``txt``.
     """
     gpt2_encoding = tiktoken.get_encoding("gpt2")
     windows = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)
     return DataLoader(
         windows,
         batch_size=batch_size,
         shuffle=shuffle,
         drop_last=drop_last,
         num_workers=num_workers,
     )

In [19]:
# Read the whole file into one string; the path is relative to the current working directory.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [20]:
# Baseline: windows of 4 tokens, advanced by 1 token, batch size 1, no shuffling,
# so the first batch starts at the beginning of the text.
dataloader = create_dataloader_v1(raw_text, batch_size=1, max_length=4, stride=1, shuffle=False)
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [30]:
# Experiment: max_length=2 with stride=2. The stride equals the window length,
# so consecutive input windows do not overlap.
dataloader = create_dataloader_v1(raw_text, batch_size=1, max_length=2, stride=2, shuffle=False)
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[ 40, 367]]), tensor([[ 367, 2885]])]


In [31]:
# Experiment: max_length=8 with stride=2. The stride is smaller than the window
# length, so consecutive input windows share 6 tokens.

dataloader = create_dataloader_v1(raw_text, batch_size=1, max_length=8, stride=2, shuffle=False)
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[  40,  367, 2885, 1464, 1807, 3619,  402,  271]]), tensor([[  367,  2885,  1464,  1807,  3619,   402,   271, 10899]])]


### end of exercise 2.2 ###