# **Creating Input-Target pairs**

NB: the lecture notes can be found here: **[🔹 Lecture 9 Notes 🔹](lecture_9_notes.md)**


Implementing simple tokenization from scratch again.

In [None]:
import tiktoken
import torch

In [7]:
# Read the whole text file into memory as a single string; the tokenization cells below work on `raw_text`.
with open("./data/the-verdict.txt", 'r', encoding='utf-8') as f:
    raw_text = f.read()

In [8]:
# The word-level vocabulary is not needed for the rest of this notebook,
# but this is how to build it with the custom word-based tokenizer if you want it.
from word_based_tokenizer import SimpleWordBasedTokenizer

word_tokenizer = SimpleWordBasedTokenizer(split_regex=r'([,.:;?_!"\'()]|--|\s)', sub_regex=r'\s+([,.?!"()\'])')
vocabs = word_tokenizer.create_vocabs(raw_text=raw_text)
# NOTE: this prints the entire token -> id mapping, which produces a very long output.
print(vocabs)

{'!': 0, '"': 1, "'": 2, '(': 3, ')': 4, ',': 5, '--': 6, '.': 7, ':': 8, ';': 9, '?': 10, 'A': 11, 'Ah': 12, 'Among': 13, 'And': 14, 'Are': 15, 'Arrt': 16, 'As': 17, 'At': 18, 'Be': 19, 'Begin': 20, 'Burlington': 21, 'But': 22, 'By': 23, 'Carlo': 24, 'Chicago': 25, 'Claude': 26, 'Come': 27, 'Croft': 28, 'Destroyed': 29, 'Devonshire': 30, 'Don': 31, 'Dubarry': 32, 'Emperors': 33, 'Florence': 34, 'For': 35, 'Gallery': 36, 'Gideon': 37, 'Gisburn': 38, 'Gisburns': 39, 'Grafton': 40, 'Greek': 41, 'Grindle': 42, 'Grindles': 43, 'HAD': 44, 'Had': 45, 'Hang': 46, 'Has': 47, 'He': 48, 'Her': 49, 'Hermia': 50, 'His': 51, 'How': 52, 'I': 53, 'If': 54, 'In': 55, 'It': 56, 'Jack': 57, 'Jove': 58, 'Just': 59, 'Lord': 60, 'Made': 61, 'Miss': 62, 'Money': 63, 'Monte': 64, 'Moon-dancers': 65, 'Mr': 66, 'Mrs': 67, 'My': 68, 'Never': 69, 'No': 70, 'Now': 71, 'Nutley': 72, 'Of': 73, 'Oh': 74, 'On': 75, 'Once': 76, 'Only': 77, 'Or': 78, 'Perhaps': 79, 'Poor': 80, 'Professional': 81, 'Renaissance': 82, 'Ri

In [12]:
# Equivalent one-liner (this would work as well):
# enc_text = tiktoken.get_encoding(encoding_name='gpt2').encode(raw_text)

# Load the GPT-2 encoding from tiktoken and encode the full text into token ids.
gpt2_tokenizer = tiktoken.get_encoding(encoding_name='gpt2')
enc_text = gpt2_tokenizer.encode(raw_text)
print(raw_text[:99])  # first 99 characters of the raw text
print(enc_text[:99])  # first 99 token ids (covers more text than the 99 characters above)
print(len(enc_text))  # total number of tokens in the text

I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 
[40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138, 257, 7026, 15632, 438, 2016, 257, 922, 5891, 1576, 438, 568, 340, 373, 645, 1049, 5975, 284, 502, 284, 3285, 326, 11, 287, 262, 6001, 286, 465, 13476, 11, 339, 550, 5710, 465, 12036, 11, 6405, 257, 5527, 27075, 11, 290, 4920, 2241, 287, 257, 4489, 64, 319, 262, 34686, 41976, 13, 357, 10915, 314, 2138, 1807, 340, 561, 423, 587, 10598, 393, 28537, 2014, 198, 198, 1, 464, 6001, 286, 465, 13476, 1, 438, 5562, 373, 644, 262, 1466, 1444, 340, 13, 314, 460, 3285, 9074, 13, 46606]
5145


In [14]:
# Skip the first 50 token ids and use the remainder as the sample for the examples below.
enc_sample = enc_text[50:]

- **Context size:** refers to how many words/tokens you want to give as input for the model to make its prediction
- The context size determines how many tokens are included in the input

- To think of it intuitively, **context size is basically how many words/tokens the model should pay attention to at one time to predict the next word/token**

In [15]:
context_size = 4  # length of the input
# A context_size of 4 means the model looks at a sequence of 4 tokens
# to predict the next token in the sequence.
# Schematically, the input x is the first 4 tokens [1, 2, 3, 4] and the
# target y is the same window shifted one position forward: [2, 3, 4, 5].

x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]

# Label the input row as "x" (it was previously mislabelled "y").
# The extra padding after "y:" visually offsets the target row relative to the input row.
print(f"x: {x}")
print(f"y:      {y}")


y: [290, 4920, 2241, 287]
y:      [4920, 2241, 287, 257]


**NB: one input-target pair contains as many next-token prediction tasks as the `context size` (here, 4), as the next cell shows.**

In [16]:
# Expand the single (x, y) pair into its individual next-token prediction tasks:
# the first i tokens form the context and the token at position i is the one to predict.
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    
    print(f"{context} ----> {desired}")

[290] ----> 4920
[290, 4920] ----> 2241
[290, 4920, 2241] ----> 287
[290, 4920, 2241, 287] ----> 257


In [20]:
# Same prediction tasks as above, but decoded back to text so they are human-readable.
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    
    # decode() expects a list of token ids, hence the single target id is wrapped in a list.
    print(f"{gpt2_tokenizer.decode(context)} ----> {gpt2_tokenizer.decode([desired])}")

 and ---->  established
 and established ---->  himself
 and established himself ---->  in
 and established himself in ---->  a


### **IMPLEMENT DATA LOADER**

For an efficient data loader implementation, we'll use PyTorch's built-in `Dataset` and `DataLoader` classes.

PyTorch documentation on datasets and dataloaders:
- https://docs.pytorch.org/tutorials/beginner/basics/data_tutorial.html
- https://docs.pytorch.org/docs/stable/data.html


**Steps**

**Step 1**: Tokenize the entire text
    
**Step 2**: Use a sliding window to chunk the book into overlapping sequences of max_length

**Step 3**: Return the total number of rows in the dataset

**Step 4**: Return a single row from the dataset

In [8]:
from torch.utils.data import Dataset, DataLoader
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id pairs for next-token prediction.

    The text is tokenized once and then cut into windows of ``max_length`` tokens.
    Each target window is the matching input window shifted one position forward,
    so ``target[j]`` is the token that follows ``input[j]`` in the text.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that returns
            a sequence of integer token ids (e.g. a tiktoken encoding).
        max_length: Number of tokens in every input window and every target window.
        stride: How far the window start advances between consecutive samples.
            A stride smaller than ``max_length`` makes consecutive windows overlap.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is not positive.

    Note:
        If the text has ``max_length`` tokens or fewer, no complete
        (input, target) pair exists and the dataset is empty.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Fail early with a clear message: a non-positive max_length would produce
        # empty or degenerate windows, and a negative stride would silently
        # produce an empty dataset.
        if max_length <= 0:
            raise ValueError(f"max_length must be a positive integer, got {max_length!r}")
        if stride <= 0:
            raise ValueError(f"stride must be a positive integer, got {stride!r}")

        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Slide a window of `max_length` tokens over the text, moving `stride` tokens per step.
        # For each window start i:
        #   - input_chunk  is token_ids[i : i + max_length]
        #   - target_chunk is token_ids[i + 1 : i + max_length + 1] (the next-token labels)
        # Stopping at len(token_ids) - max_length guarantees that the target window,
        # which reaches one token further than the input window, stays in bounds.
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1 : i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of (input, target) windows in the dataset."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair stored at position ``idx``.

        The PyTorch DataLoader calls this to fetch the individual samples of a batch.
        """
        return self.input_ids[idx], self.target_ids[idx]

Step 1: Initialize the tokenizer

Step 2: Create dataset

Step 3: Create the dataloader. `drop_last=True` drops the last batch if it is shorter than the specified `batch_size`, to prevent loss spikes during training

Step 4: `num_workers` sets the number of CPU processes to use for preprocessing

In [3]:
def create_dataloader_v1(
    txt,
    batch_size=4,
    max_length=256,
    stride=128,
    shuffle=True,
    drop_last=True,
    num_workers=0,
):
    """Build a DataLoader of sliding-window (input, target) batches from raw text.

    Args:
        txt: Raw text; it is tokenized with tiktoken's GPT-2 encoding.
        batch_size: Number of (input, target) windows per batch.
        max_length: Number of tokens per window, passed to ``GPTDatasetV1``.
        stride: Step between consecutive window starts, passed to ``GPTDatasetV1``.
        shuffle: Forwarded to the DataLoader's ``shuffle`` option.
        drop_last: Forwarded to the DataLoader; when true, the final batch is
            dropped if it holds fewer than ``batch_size`` samples.
        num_workers: Forwarded to the DataLoader's ``num_workers`` option.

    Returns:
        A ``DataLoader`` over a ``GPTDatasetV1`` built from ``txt``.
    """
    # The GPT-2 encoding turns the text into token ids.
    gpt2_encoding = tiktoken.get_encoding('gpt2')

    # The DataLoader pulls individual samples from this dataset via __getitem__
    # and assembles them into input/target batch tensors.
    windowed_dataset = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)

    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windowed_dataset, **loader_options)

Test the dataloader with a batch size of 1 for an LLM with a context size of 4:

In [4]:
# Load the raw text (same file as at the top of the notebook) for the dataloader tests below.
with open("./data/the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [29]:
!python.exe -m pip install --upgrade pip

Collecting pip
  Downloading pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-25.1.1-py3-none-any.whl (1.8 MB)
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
   ----- ---------------------------------- 0.3/1.8 MB ? eta -:--:--
   ---------------------------------------- 1.8/1.8 MB 5.5 MB/s eta 0:00:00
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 25.0.1
    Uninstalling pip-25.0.1:
      Successfully uninstalled pip-25.0.1
Successfully installed pip-25.1.1


In [1]:
# Check which PyTorch version the kernel is using.
import torch
print(torch.__version__)

2.5.1


In [9]:
# batch_size=1, max_length=4, stride=1: every batch holds a single 4-token window and
# consecutive windows start one token apart. shuffle=False keeps the windows in text order.
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=1, shuffle=False
)
data_iter = iter(dataloader)
first_batch = next(data_iter)  # a pair: [input tensor, target tensor]
print(first_batch)

[tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [10]:
# With stride=1 the next window starts exactly one token after the previous one.
second_batch = next(data_iter)
print(second_batch)

[tensor([[ 367, 2885, 1464, 1807]]), tensor([[2885, 1464, 1807, 3619]])]


In [11]:
# stride equals max_length here, so consecutive input windows do not overlap;
# batch_size=8 stacks eight 4-token windows into each batch.
dataloader2 = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)
data_iter2 = iter(dataloader2)
inputs,targets = next(data_iter2)
print(f"Inputs:\n {inputs}")
print(f"Targets:\n {targets}")

Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])
Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])
