In [1]:
# Read the whole training corpus into memory as a single string
with open("the-verdict.txt", mode="r", encoding="utf-8") as corpus_file:
    raw_text = corpus_file.read()

print("Total number of characters: ", len(raw_text))

Total number of characters:  20479


In [3]:
# Simple tokenization using re.split
# First we tokenize by splitting on whitespace and punctuation.
# We keep punctuation marks and words as tokens. Whitespace is dropped here, although keeping it as tokens is useful where whitespace is contextually important, e.g. in Python code.
# Then we write the encode and decode functions that map those tokens to token IDs and back.

import re

# Getting tokens from raw_data
preprocessed = re.split(r'([,.:;?!_"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print("Preprocessed training data length:", len(preprocessed))

# Sorting the vocab so the token ids are extracted accordingly
all_words = sorted(list(set(preprocessed)))
all_words.extend(["<|endoftext|>", "<|unk|>"]) # Add special tokens
vocab_size = len(all_words)
print("Sorted vocabulary: ", vocab_size)

# Creating vocabulary with token ids
vocab = {token: integer for integer, token in enumerate(all_words)}

# Creating a tokenizer class to automatically encode & decode

class SimpleTokenizer:
    """Word/punctuation-level tokenizer backed by a fixed vocabulary.

    Text is split on whitespace, on ``--`` and on the punctuation characters
    ``, . : ; ? ! _ " ( ) '``; whitespace-only pieces are discarded. Tokens
    that are not in the vocabulary are encoded as ``<|unk|>``.
    """

    # Must use the same delimiters as the split that built the vocabulary
    # (including ':' and ';'); otherwise text such as "tea;" stays one piece
    # and is mapped to <|unk|> even though "tea" and ";" are both known.
    _SPLIT_PATTERN = re.compile(r'([,.:;?!_"()\']|--|\s)')
    # Punctuation that is glued back onto the preceding token when decoding.
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.:;?!"()\'])')

    def __init__(self, vocab):
        """Store the token -> id mapping and derive the inverse id -> token mapping.

        Args:
            vocab: dict mapping token strings to integer token ids.
        """
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Convert ``text`` into a list of token ids.

        Args:
            text: the string to tokenize.

        Returns:
            List of integer ids, one per token; unknown tokens get the id of
            ``<|unk|>``.

        Raises:
            KeyError: if an unknown token is met and the vocabulary has no
                ``<|unk|>`` entry.
        """
        stripped = (piece.strip() for piece in self._SPLIT_PATTERN.split(text))
        tokens = [piece for piece in stripped if piece]
        return [
            self.str_to_int[token if token in self.str_to_int else "<|unk|>"]
            for token in tokens
        ]

    def decode(self, ids):
        """Convert a sequence of token ids back into a string.

        Tokens are joined with single spaces, then the whitespace in front of
        the punctuation characters ``, . : ; ? ! " ( ) '`` is removed.

        Raises:
            KeyError: if an id is not present in the vocabulary.
        """
        text = " ".join(self.int_to_str[i] for i in ids)
        return self._SPACE_BEFORE_PUNCT.sub(r'\1', text)
    
# Two sample sentences joined by the end-of-text marker
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = f"{text1} <|endoftext|> {text2}"

tokenizer = SimpleTokenizer(vocab)
# Show the token ids, then the text recovered from an encode/decode round trip
print(tokenizer.encode(text))
print(tokenizer.decode(tokenizer.encode(text)))

Preprocessed training data length: 4690
Sorted vocabulary:  1132
[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]
<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


In [4]:
#  Implement Byte Pair Encoding & test it out
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")

# Encode and decode a sentence containing the end-of-text marker,
# which is passed explicitly through allowed_special
text = "Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknwnPlace."
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)
strings = tokenizer.decode(integers)
print(strings)

# Decode each id of a single word separately to see the pieces it was split into
text = "Confinement"
integers = tokenizer.encode(text)
decoded = [tokenizer.decode([token_id]) for token_id in integers]
print(decoded)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 286, 617, 2954, 77, 675, 27271, 13]
Hello, do you like tea? <|endoftext|> In the sunlit terraces of someunknwnPlace.
['Con', 'fin', 'ement']


In [18]:
# Data Sampling with Sliding Window
# Implement a DataLoader that converts our training dataset into input-target pairs

import torch
from torch.utils.data import Dataset, DataLoader
# First implement the Dataset class 
class GPTDataset(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is encoded once with ``tokenizer.encode``. Each input is a window
    of ``max_length`` consecutive token ids; its target is the same window
    shifted one position to the right. Window start positions advance by
    ``stride``.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        """Tokenize ``txt`` and pre-compute every input/target window.

        Args:
            txt: text to tokenize.
            tokenizer: object exposing ``encode(txt)`` that returns a sequence of ids.
            max_length: number of token ids per window.
            stride: step between the start positions of consecutive windows.
        """
        encoded = tokenizer.encode(txt)
        print(len(encoded))  # total number of token ids in the text

        # Stop early enough that the shifted target window is always full length
        window_starts = range(0, len(encoded) - max_length, stride)
        self.input_ids = [
            torch.tensor(encoded[start: start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(encoded[start + 1: start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Number of input/target pairs."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]
    

# DataLoader factory function
def create_dataloader_v1(
    txt,
    batch_size=4,
    max_length=256,
    stride=128,
    shuffle=True,
    drop_last=True,
    num_workers=0,
):
    """Build a DataLoader over sliding-window input/target pairs from ``txt``.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    GPTDataset using ``max_length`` and ``stride``; ``batch_size``, ``shuffle``,
    ``drop_last`` and ``num_workers`` are passed through to the DataLoader.

    Returns:
        The configured ``DataLoader``.
    """
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    windows = GPTDataset(txt, tiktoken.get_encoding("gpt2"), max_length, stride)
    return DataLoader(windows, **loader_options)


In [None]:
# Re-read the corpus so this cell does not depend on an earlier one
with open("the-verdict.txt", mode="r", encoding="utf-8") as corpus_file:
    raw_text = corpus_file.read()

# Windows of 4 token ids, moved forward one id at a time, in batches of 8
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=8,
    max_length=4,
    stride=1,
    shuffle=False,
)
data_iter = iter(dataloader)
# Pull the first batch
inputs, targets = next(data_iter)
# Inputs is a tensor of input token ids, and targets is a tensor of target token ids


5145


In [None]:
# Embedding layer
# We set vocab_size to 50257 because the pretrained GPT-2 tokenizer we are using has 50,257 tokens.
# When text is converted to token ids, any id from 0 to 50256 can appear.
# So we need an embedding table with one row per possible token id, so that the embedding of any token can be looked up directly.
vocab_size = 50257
output_dim = 256

# A learnable lookup table: one row of size output_dim for each of the vocab_size ids
token_embedding_layer = torch.nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=output_dim,
)
# inputs is a tensor of token ids; each id is replaced by its row of the table
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


# 