In [10]:
# Reading the text file

# Use the canonical codec name "utf-8". The previous spelling "utf=8" only
# resolved because Python's codec lookup normalises punctuation in encoding names.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
# Preview the first 32 characters and report the total character count.
print(f"Example of contents in our text file: \n {raw_text[:32]}\n")
print(f"Length of charachters in our raw text file = {len(raw_text)} charachters")

Example of contents in our text file: 
 I HAD always thought Jack Gisbur

Length of charachters in our raw text file = 20479 charachters


In [11]:
# Byte pair tokenization
import tiktoken

# Load tiktoken's "gpt2" encoding; `tokenizer` is reused by the cells below.
tokenizer = tiktoken.get_encoding("gpt2")
# Printing the object shows which encoding was loaded.
print(tokenizer)

<Encoding 'gpt2'>


In [15]:
# Encoding the text

# Convert the whole raw text into a list of integer token IDs.
token_ids = tokenizer.encode(raw_text)
# Bare last expression: displays the number of tokens produced.
len(token_ids)

5145

## Input-Target Pairs - Foundational Code

- For each text chunk, we want the inputs and the targets
- Since we want the model to predict the next word (token), the targets are the inputs shifted by one position to the right
- context size = how many tokens are included in the input
- Example (token sequence 1, 2, 3, 4, 5):
   - X = [1, 2, 3, 4]
   - y = [2, 3, 4, 5]
     - If X = [1]          y = 2
     - If X = [1, 2]       y = 3
     - If X = [1, 2, 3]    y = 4
     - If X = [1, 2, 3, 4] y = 5
   - context size = 4

In [37]:
# Creating a sample by removing the first 50 tokens
# (the remaining token IDs are what the example cells below slice from)

sample_tokens = token_ids[50:]

In [38]:
# Example of the technique for creating input-target pairs

context_size = 4
# This means the model is trained to look at a sequence of up to 4 tokens to predict the next token
# For example, if X is the first 4 tokens = [1, 2, 3, 4], then y is the same window shifted by one = [2, 3, 4, 5]
X = sample_tokens[:context_size]
y = sample_tokens[1:context_size+1]
print(f"f(x) = {X}")
print(f"f(y) =      {y}")
# Checking the conditions (each growing prefix of X is paired with the token that follows it)
# X = [290]                  : y = 4920
# X = [290, 4920]            : y = 2241
# X = [290, 4920, 2241]      : y = 287
# X = [290, 4920, 2241, 287] : y = 257

f(x) = [290, 4920, 2241, 287]
f(y) =      [4920, 2241, 287, 257]


In [39]:
# Building the next-word prediction task: the targets are the inputs shifted one position to the right,
# so each growing prefix of the sample is paired with the single token that follows it
# Note: decode works only with a sequence, so the single target ID is wrapped in a list before decoding
for i in range(1, context_size+1):
    # Input: the first i tokens of the sample
    context = sample_tokens[:i]
    # Target: the token immediately after that prefix
    desired = sample_tokens[i]
    # First line shows the raw token IDs, second line the decoded text
    print(f"{context}  ---> {desired}")
    print(f"{tokenizer.decode(context)} ---> {tokenizer.decode([desired])}")

[290]  ---> 4920
 and --->  established
[290, 4920]  ---> 2241
 and established --->  himself
[290, 4920, 2241]  ---> 287
 and established himself --->  in
[290, 4920, 2241, 287]  ---> 257
 and established himself in --->  a


## Implementing the DataLoader(tensors)

![image.png](attachment:60725669-8dbd-4126-ae8b-63bd0f7373ca.png)

## STEPS
--------------------------------
1. Tokenize the entire text
2. Use a sliding window to chunk the book into overlapping sequences of max length (the context size)
3. Return the total number of rows in the dataset (`__len__`)
4. Return a single row from the dataset (`__getitem__`)

In [54]:
# Testing the logic of the sliding window

import torch
# NOTE: this rebinds sample_tokens (earlier it was token_ids[50:]); from here on it is the first 12 token IDs.
sample_tokens = token_ids[:12]
def slidingWindow(ids,context_size, stride):
    """Cut `ids` into overlapping input/target windows.

    A window starts at every `stride`-th position in
    `range(0, len(ids) - context_size, stride)`. The input window holds
    `context_size` consecutive IDs; the target window is the same span moved
    one position to the right.

    Args:
        ids: sequence of token IDs that supports slicing.
        context_size: number of IDs in each window.
        stride: distance between the starts of consecutive windows.

    Returns:
        A tuple `(target_windows, input_windows)` of two lists of tensors.
        Note the order: targets come first, inputs second.
    """
    window_starts = range(0, len(ids) - context_size, stride)
    input_windows = [
        torch.tensor(ids[start: start + context_size]) for start in window_starts
    ]
    # Same span as the input window, shifted right by one position.
    target_windows = [
        torch.tensor(ids[start + 1: start + context_size + 1]) for start in window_starts
    ]
    return target_windows, input_windows

# slidingWindow returns (targets, inputs) in that order, so o = target windows and i = input windows.
o, i = slidingWindow(sample_tokens, 4, 1)
# 12 tokens with context_size 4 and stride 1 give range(0, 8), i.e. 8 windows of each kind.
len(o), len(i)

(8, 8)

In [78]:
from torch.utils.data import Dataset, DataLoader
import torch
class GptDatasetV1(Dataset):
    """Map-style dataset of overlapping (input, target) token windows.

    The text is tokenized once. A window of `context_size` token IDs is taken
    every `stride` positions; its target is the same window shifted one
    position to the right (next-token prediction).

    Args:
        text: raw text to tokenize.
        tokenizer: object exposing `encode(text, allowed_special=...)` that
            returns a sequence of integer token IDs.
        context_size: number of tokens in every input/target window (>= 1).
        stride: distance between the starts of consecutive windows (>= 1).

    Raises:
        ValueError: if `context_size` or `stride` is smaller than 1.
    """

    def __init__(self, text, tokenizer, context_size, stride):
        # Reject values that would otherwise silently yield empty windows
        # (context_size < 1) or an empty dataset (negative stride).
        if context_size < 1:
            raise ValueError(f"context_size must be >= 1, got {context_size}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")
        # Creation of the X and y arrays
        self.input_ids = []
        self.target_ids = []
        # Encoding the whole raw text
        token_ids = tokenizer.encode(text, allowed_special = {"<|endoftext|>"})
        # Sliding window: stopping at len(token_ids) - context_size guarantees that
        # every target window (shifted by one) still fits inside token_ids.
        for i in range(0, len(token_ids)-context_size, stride):
            input_chunk = token_ids[i:i+context_size]
            output_chunk = token_ids[i+1:i+context_size+1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(output_chunk))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return `(input_ids[idx], target_ids[idx])`.

        `idx` is applied directly to the underlying Python lists, so an integer
        yields a pair of tensors and a slice yields a pair of lists of tensors.
        """
        return self.input_ids[idx], self.target_ids[idx]
# Small demo: build the dataset from the first 32 characters only (context_size 4, stride 1).
dataset = GptDatasetV1(raw_text[:32], tokenizer, 4, 1)
# __getitem__ applies the index to both internal lists, so a slice returns
# (list of input tensors, list of target tensors) rather than a list of pairs.
dataset[:5]

([tensor([  40,  367, 2885, 1464]),
  tensor([ 367, 2885, 1464, 1807]),
  tensor([2885, 1464, 1807, 3619]),
  tensor([1464, 1807, 3619,  402]),
  tensor([1807, 3619,  402,  271])],
 [tensor([ 367, 2885, 1464, 1807]),
  tensor([2885, 1464, 1807, 3619]),
  tensor([1464, 1807, 3619,  402]),
  tensor([1807, 3619,  402,  271]),
  tensor([3619,  402,  271, 6236])])

## DataLoader

- The following code loads the inputs and targets from `GptDatasetV1` into batches using the DataLoader

1. Initialize the tokenizer
2. Create the dataset
3. **drop_last = True** drops the last batch if it is smaller than the specified batch size, to prevent loss spikes during training
4. **num_workers** sets the number of CPU worker processes used to load the data

batch_size - Number of samples in each batch (during training, the parameters are updated once per batch)
workers    - Number of CPU worker processes used for parallel data loading (passed to the DataLoader as num_workers)

In [89]:
# Creating the dataloader function

def dataloaderV1(txt, context_size = 256, stride = 128, drop = True, batch_size = 4, workers = 2, shuffle = True):
    """Tokenize `txt` with the GPT-2 encoding and wrap it in a batched DataLoader.

    Args:
        txt: raw text to turn into input/target windows.
        context_size: number of tokens per window (forwarded to GptDatasetV1).
        stride: step between window starts (forwarded to GptDatasetV1).
        drop: forwarded to the DataLoader as `drop_last`.
        batch_size: forwarded to the DataLoader as `batch_size`.
        workers: forwarded to the DataLoader as `num_workers`.
        shuffle: forwarded to the DataLoader as `shuffle`.

    Returns:
        A `DataLoader` over a `GptDatasetV1` built from `txt`.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windows = GptDatasetV1(
        text = txt,
        tokenizer = gpt2_encoding,
        context_size = context_size,
        stride = stride,
    )
    return DataLoader(
        dataset = windows,
        batch_size = batch_size,
        shuffle = shuffle,
        drop_last = drop,
        num_workers = workers,
    )

In [None]:
# Testing

# shuffle and workers are not passed, so dataloaderV1's defaults apply (shuffle = True, workers = 2):
# the "first" batch is therefore a randomly chosen window, not the start of the text.
# NOTE(review): if this cell hangs or errors in your environment, try workers = 0 so loading
# happens in the notebook process itself — TODO confirm.
dataloader = dataloaderV1(txt = raw_text, context_size = 4, stride = 1, batch_size = 1)
# Iterating through the dataloader
dataloader_iter = iter(dataloader)
# Pull a single batch and display it as the cell's output
first_batch = next(dataloader_iter)
first_batch