# Installation

In [2]:
!pip install tiktoken



# Byte Pair Encoding

In [2]:
import tiktoken

# Record the installed tiktoken version next to the outputs that follow.
print(tiktoken.__version__)

0.11.0


In [3]:
# Look up the encoding tiktoken associates with the "gpt-2" model name;
# `tokenizer` is reused by the cells below.
tokenizer = tiktoken.encoding_for_model("gpt-2")
print(tokenizer)
# dir(tiktoken)  # uncomment to list the names the tiktoken module exposes

<Encoding 'gpt2'>


In [10]:
# Sample text containing the <|endoftext|> marker and a made-up word.
# NOTE(review): the two adjacent string literals are concatenated without a
# space, so the text reads "terracesof" -- add a leading space to the second
# literal if that is unintended.
test = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
     "of someunknownPlace."
 )
# <|endoftext|> is explicitly allowed so the encoder accepts it in the input.
integers = tokenizer.encode(test, allowed_special={"<|endoftext|>"})
print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


In [11]:
# Round trip: turn the token ids back into text.
tokenizer.decode(integers)

'Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.'

In [13]:
# The same round trip as a single expression: encode, then immediately decode.
tokenizer.decode(tokenizer.encode(test, allowed_special={"<|endoftext|>"}))

'Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.'

# Data Sampling with Sliding Window

In [4]:
# Read the small sample text file and tokenize all of it.
with open("./../BookAndDataFiles/txtfiles/test", "r", encoding = "utf-8") as f:
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text)
# Number of tokens in the encoded sample.
print(len(enc_text))

688


In [38]:
# Keep only the first 50 tokens and show the text they decode to.
enc_sample = enc_text[:50]
print(tokenizer.decode(enc_sample))

like a rag doll.

Today our big brains pay off nicely, because we can produce cars and guns that enable us to move much faster than chimps, and shoot them from a safe distance instead of wrestling. But cars and guns are a


In [40]:
# Next-token pairs as token ids: the first i tokens form the context (x)
# and the token at position i is the target (y).
context_size = 4
for i in range(1, context_size+1):
    x = enc_sample[:i]
    y = enc_sample[i]
    print(f'{x} --> {y}')

[2339] --> 257
[2339, 257] --> 34232
[2339, 257, 34232] --> 3654
[2339, 257, 34232, 3654] --> 13


In [39]:
# The same context/target pairs, decoded to text. This cell reads from
# enc_text (enc_sample is its first 50 tokens) and reuses context_size
# from the previous cell.
for i in range(1, context_size+1):
    x = enc_text[:i]
    y = enc_text[i]
    # decode() takes a list of ids, hence the single target is wrapped in a list.
    print(f'{tokenizer.decode(x)} --> {tokenizer.decode([y])}')

like -->  a
like a -->  rag
like a rag -->  doll
like a rag doll --> .


In [42]:
import torch
from torch.utils.data import DataLoader, Dataset

class GPT_Dataset(Dataset):
    """Sliding-window dataset of (input, target) token-id pairs.

    The text is tokenized once. Every window of ``max_len`` tokens becomes an
    input, and the same window shifted right by one token becomes its target.
    """

    def __init__(self, tokenizer, txt, max_len, stride):
        """Tokenize ``txt`` and cut it into overlapping windows.

        Args:
            tokenizer: object with a tiktoken-style
                ``encode(text, allowed_special=...)`` method.
            txt: raw text to tokenize.
            max_len: number of tokens in each input/target window.
            stride: number of tokens between the starts of consecutive windows.
        """
        self.input_ids = []
        self.target_ids = []

        # Allow the <|endoftext|> marker, consistent with the other encode
        # calls in this notebook; without it, text containing the marker is
        # rejected by tiktoken's default settings.
        encoded_text = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
        # Stop max_len tokens before the end so the shifted target window is
        # always full length.
        for i in range(0, len(encoded_text)-max_len, stride):
            input_chunk = encoded_text[i : i+max_len]
            target_chunk = encoded_text[i+1 : i+max_len+1]
            # Explicit integer dtype: ids stay int64 even for an empty chunk.
            self.input_ids.append(torch.tensor(input_chunk, dtype=torch.long))
            self.target_ids.append(torch.tensor(target_chunk, dtype=torch.long))

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [43]:
def create_data_loader(txt, max_len = 256, stride = 128, batch_size= 8,
                       shuffle=True, num_workers=0, drop_last=True):
    """Build a DataLoader of sliding-window (input, target) batches from text.

    Args:
        txt: raw text handed to ``GPT_Dataset`` for tokenization.
        max_len: tokens per window (default 256).
        stride: tokens between window starts (default 128).
        batch_size, shuffle, num_workers, drop_last: forwarded unchanged to
            ``torch.utils.data.DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    # The tokenizer is looked up first, then the dataset tokenizes txt with it.
    gpt2_tokenizer = tiktoken.encoding_for_model("gpt-2")
    windows = GPT_Dataset(gpt2_tokenizer, txt, max_len, stride)

    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "num_workers": num_workers,
        "drop_last": drop_last,
    }
    return DataLoader(windows, **loader_options)

In [74]:
# Re-read the sample text so this cell does not depend on raw_text from earlier.
with open("./../BookAndDataFiles/txtfiles/test", "r", encoding = "utf-8") as f:
    raw_text = f.read()

# Small window/stride and batch_size = 1, unshuffled, so the batches can be
# inspected one at a time in order.
dataloader = create_data_loader(txt = raw_text, max_len = 5, stride = 2, batch_size = 1, shuffle = False)
data_iter = iter(dataloader)

In [75]:
# First batch: an [inputs, targets] pair; the targets are the inputs shifted by one token.
first_batch = next(data_iter)
# NOTE(review): this looks like a leftover debugging print -- consider removing it.
print("Hello")
print(first_batch)

Hello
[tensor([[ 2339,   257, 34232,  3654,    13]]), tensor([[  257, 34232,  3654,    13,   198]])]


In [76]:
# Second batch: with stride = 2 the window starts two tokens after the first one.
second_batch = next(data_iter)
print(second_batch)

[tensor([[34232,  3654,    13,   198,   198]]), tensor([[3654,   13,  198,  198, 8888]])]


In [80]:
# Same windows as before, now grouped eight per batch.
dataloader = create_data_loader(txt = raw_text, max_len = 5, stride = 2, batch_size = 8, shuffle = False)
data_iter = iter(dataloader)
# Each batch unpacks into an inputs tensor and a targets tensor.
inputs, targets = next(data_iter)
print(f'inputs: \n{inputs}\n')
print(f'targets: \n{targets}')

inputs: 
tensor([[ 2339,   257, 34232,  3654,    13],
        [34232,  3654,    13,   198,   198],
        [   13,   198,   198,  8888,   674],
        [  198,  8888,   674,  1263, 14290],
        [  674,  1263, 14290,  1414,   572],
        [14290,  1414,   572, 16576,    11],
        [  572, 16576,    11,   780,   356],
        [   11,   780,   356,   460,  4439]])

targets: 
tensor([[  257, 34232,  3654,    13,   198],
        [ 3654,    13,   198,   198,  8888],
        [  198,   198,  8888,   674,  1263],
        [ 8888,   674,  1263, 14290,  1414],
        [ 1263, 14290,  1414,   572, 16576],
        [ 1414,   572, 16576,    11,   780],
        [16576,    11,   780,   356,   460],
        [  780,   356,   460,  4439,  5006]])


# Creating Token Embeddings

In [81]:
# Toy sizes so the whole embedding matrix can be printed.
# Both names are reassigned to different values in a later cell.
vocab_size = 6
output_dim = 3

In [82]:
# Seed the RNG so the randomly initialised weights are reproducible.
torch.manual_seed(123)
# One row per vocabulary entry, output_dim columns.
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [85]:
# Looking up id 3 returns row 3 of the weight matrix printed above.
print(embedding_layer(torch.tensor([3])))

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)


# Positional Embedding

In [87]:
# Reassigns vocab_size / output_dim (toy values were used in an earlier cell).
vocab_size = 50257
output_dim = 256

# No seed is set here, so these weights differ from run to run.
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

with open("./../BookAndDataFiles/txtfiles/Homo Deus", "r", encoding = "utf-8") as f:
    raw_text = f.read()

max_length = 4
# NOTE(review): stride is left at its default (128) while max_len is 4, so
# consecutive windows skip most of the text -- confirm this is intended.
dataloader = create_data_loader(txt = raw_text, max_len = max_length, batch_size = 8, shuffle = False)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print(f'inputs: \n{inputs}')
print(f'inputs shape: {inputs.shape}')

inputs: 
tensor([[   35,   276,  3299,   198],
        [10465,   406,  4629,  6779],
        [  290, 27714,   663,  2951],
        [ 9862,    11, 23684,   290],
        [  262,   938,  1178,  4647],
        [  674, 23162,   995,   447],
        [ 8208,    12, 11085,  4289],
        [  389,   356,  1016,   284]])
inputs shape: torch.Size([8, 4])


In [89]:
# Embed every token id in the batch; the result gains a trailing output_dim axis.
token_embeddings = token_embedding_layer(inputs)
token_embeddings.shape

torch.Size([8, 4, 256])

In [93]:
# One learned vector per position 0..context_length-1.
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
pos_embeddings = pos_embedding_layer(torch.arange(context_length))

# pos_embeddings (context_length, output_dim) broadcasts over the batch axis
# of token_embeddings.
input_embeddings = token_embeddings + pos_embeddings

# Summarise

In [95]:
import tiktoken
import torch
from pathlib import Path
import os
from torch.utils.data import DataLoader, Dataset

# Directory (relative to the notebook) that GPT_Dataset below reads text files from.
txt_files_path = "./../BookAndDataFiles/txtfiles/"

class GPT_Dataset(Dataset):
    """Sliding-window dataset of (input, target) token-id pairs from text files.

    Each file is read and tokenized on its own, so no window spans two files.
    Every window of ``max_len`` tokens becomes an input, and the same window
    shifted right by one token becomes its target.
    """

    def __init__(self, tokenizer, txt_files, max_len, stride, base_dir=None):
        """Read and tokenize every file, then cut each into overlapping windows.

        Args:
            tokenizer: object with a tiktoken-style
                ``encode(text, allowed_special=...)`` method.
            txt_files: list of file names, resolved relative to ``base_dir``.
            max_len: number of tokens in each input/target window.
            stride: number of tokens between the starts of consecutive windows.
            base_dir: directory containing the files. Defaults to the
                module-level ``txt_files_path``.
        """
        if base_dir is None:
            base_dir = txt_files_path

        self.input_ids = []
        self.target_ids = []

        for txt in txt_files:
            with open(Path(base_dir, txt).resolve(), "r", encoding = "utf-8") as f:
                content = f.read()
            encoded_text = tokenizer.encode(content, allowed_special={"<|endoftext|>"})
            # Stop max_len tokens before the end so the shifted target window
            # is always full length.
            for i in range(0, len(encoded_text)-max_len, stride):
                input_chunk = encoded_text[i : i+max_len]
                target_chunk = encoded_text[i+1 : i+max_len+1]
                # Explicit integer dtype: ids stay int64 even for an empty chunk.
                self.input_ids.append(torch.tensor(input_chunk, dtype=torch.long))
                self.target_ids.append(torch.tensor(target_chunk, dtype=torch.long))


    def __len__(self):
        """Return the total number of windows across all files."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]


def create_data_loader(txt_files, max_len = 256, stride = 128, batch_size= 8,
                       shuffle=True, num_workers=0, drop_last=True):
    """Build a DataLoader of sliding-window batches from a list of text files.

    Args:
        txt_files: file names handed to ``GPT_Dataset``.
        max_len: tokens per window (default 256).
        stride: tokens between window starts (default 128).
        batch_size, shuffle, num_workers, drop_last: forwarded unchanged to
            ``torch.utils.data.DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    gpt2_encoding = tiktoken.encoding_for_model("gpt-2")
    return DataLoader(
        GPT_Dataset(gpt2_encoding, txt_files, max_len, stride),
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        drop_last=drop_last,
    )

In [102]:
# test
# Collect every text file except the small "test" sample and build a loader over them.
txt_files = []
max_length = 4

# os.listdir returns names in arbitrary order; sorting keeps the file order,
# and therefore the unshuffled batches, the same from run to run.
for txt_file_name in sorted(os.listdir(txt_files_path)):
    if txt_file_name == "test":
        continue
    # Sub-directories cannot be opened as text, so keep regular files only.
    if not os.path.isfile(os.path.join(txt_files_path, txt_file_name)):
        continue
    txt_files.append(txt_file_name)

dataloader = create_data_loader(txt_files = txt_files, max_len = max_length, batch_size = 8, shuffle = False, stride = 1, num_workers = 0)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print(f'inputs: \n{inputs}')

inputs: 
tensor([[15842,   198,   198, 27245],
        [  198,   198, 27245,  7994],
        [  198, 27245,  7994,   262],
        [27245,  7994,   262,  4897],
        [ 7994,   262,  4897,  7994],
        [  262,  4897,  7994,   262],
        [ 4897,  7994,   262,  6434],
        [ 7994,   262,  6434,  4418]])


In [100]:
# Number of batches the loader yields.
len(dataloader)

537184

In [103]:
context_length = max_length

# NOTE(review): vocab_size and output_dim are not defined in this section;
# they come from an earlier cell (50257 and 256).
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

token_embeddings = token_embedding_layer(inputs)
pos_embeddings = pos_embedding_layer(torch.arange(context_length))

# The positional vectors broadcast over the batch axis of the token embeddings.
input_embeddings = token_embeddings + pos_embeddings

print(input_embeddings.shape)

torch.Size([8, 4, 256])
