In [1]:
import torch
import tiktoken
from torch.utils.data import Dataset, DataLoader

In [2]:
class CreateGPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID pairs for next-token prediction.

    The text is tokenized once. Every window of ``context_length`` tokens becomes
    an input sample, and the same window shifted one token to the right becomes
    its target.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object whose ``encode(text, allowed_special=...)`` returns a
            list of integer token IDs (e.g. a ``tiktoken.Encoding``).
        context_length: Number of tokens in every input/target sample; must be >= 1.
        stride: Step, in tokens, between the starts of consecutive windows;
            must be >= 1.

    Raises:
        ValueError: If ``context_length`` or ``stride`` is less than 1.

    Note:
        A text with ``context_length`` tokens or fewer yields an empty dataset,
        because no window has a complete shifted target.
    """

    # The annotation is quoted so it is not evaluated when the class is defined.
    def __init__(self, txt: str, tokenizer: "tiktoken.Encoding", context_length: int, stride: int):
        # Fail loudly instead of silently building an empty or malformed dataset.
        if context_length < 1:
            raise ValueError(f"context_length must be >= 1, got {context_length}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")

        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Split into input and target using a sliding window approach.
        # The last start index is len(token_ids) - context_length - 1 so that the
        # target window (shifted by one token) still has context_length tokens.
        for i in range(0, len(token_ids) - context_length, stride):
            input_chunk = token_ids[i:i + context_length]
            output_chunk = token_ids[i + 1:i + 1 + context_length]

            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(output_chunk))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [3]:
# Toy corpus and windowing settings shared by the cells that follow.
txt = "Mumbai Indians is the most successful team in Indian Premier League. They have won the title five times."
# GPT-2 encoding object obtained from tiktoken.
tokenizer = tiktoken.get_encoding('gpt2')
# Number of tokens in each input/target sample.
context_length = 4
# Step between the starts of consecutive windows; with 1, neighbouring windows share 3 of their 4 tokens.
stride = 1

In [4]:
# Encode the sample text directly to inspect its raw token IDs (21 tokens for this sentence).
txt_encoded = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
txt_encoded

[44,
 21645,
 12746,
 318,
 262,
 749,
 4388,
 1074,
 287,
 3942,
 9952,
 4041,
 13,
 1119,
 423,
 1839,
 262,
 3670,
 1936,
 1661,
 13]

In [5]:
# Build the sliding-window dataset from the sample text with the settings defined earlier.
dataset = CreateGPTDatasetV1(txt, tokenizer, context_length, stride)

In [8]:
# Number of windows: range(0, len(token_ids) - context_length, stride) -> 21 - 4 = 17 samples.
len(dataset)

17

In [9]:
# Input windows: each tensor holds context_length (4) consecutive token IDs.
dataset.input_ids

[tensor([   44, 21645, 12746,   318]),
 tensor([21645, 12746,   318,   262]),
 tensor([12746,   318,   262,   749]),
 tensor([ 318,  262,  749, 4388]),
 tensor([ 262,  749, 4388, 1074]),
 tensor([ 749, 4388, 1074,  287]),
 tensor([4388, 1074,  287, 3942]),
 tensor([1074,  287, 3942, 9952]),
 tensor([ 287, 3942, 9952, 4041]),
 tensor([3942, 9952, 4041,   13]),
 tensor([9952, 4041,   13, 1119]),
 tensor([4041,   13, 1119,  423]),
 tensor([  13, 1119,  423, 1839]),
 tensor([1119,  423, 1839,  262]),
 tensor([ 423, 1839,  262, 3670]),
 tensor([1839,  262, 3670, 1936]),
 tensor([ 262, 3670, 1936, 1661])]

In [10]:
# Target windows: each tensor is the matching input window shifted one token to the right.
dataset.target_ids

[tensor([21645, 12746,   318,   262]),
 tensor([12746,   318,   262,   749]),
 tensor([ 318,  262,  749, 4388]),
 tensor([ 262,  749, 4388, 1074]),
 tensor([ 749, 4388, 1074,  287]),
 tensor([4388, 1074,  287, 3942]),
 tensor([1074,  287, 3942, 9952]),
 tensor([ 287, 3942, 9952, 4041]),
 tensor([3942, 9952, 4041,   13]),
 tensor([9952, 4041,   13, 1119]),
 tensor([4041,   13, 1119,  423]),
 tensor([  13, 1119,  423, 1839]),
 tensor([1119,  423, 1839,  262]),
 tensor([ 423, 1839,  262, 3670]),
 tensor([1839,  262, 3670, 1936]),
 tensor([ 262, 3670, 1936, 1661]),
 tensor([3670, 1936, 1661,   13])]

In [11]:
def create_dataloader_v1(txt, batch_size, max_length,
                         stride, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a DataLoader of sliding-window (input, target) token batches from raw text.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    CreateGPTDatasetV1, with ``max_length`` used as the window size.

    Args:
        txt: Raw text to tokenize.
        batch_size: Number of samples per batch.
        max_length: Tokens per sample (passed to the dataset as its context length).
        stride: Step between the starts of consecutive windows.
        shuffle: Forwarded to DataLoader (reshuffle the samples each epoch).
        drop_last: Forwarded to DataLoader (drop a final incomplete batch).
        num_workers: Forwarded to DataLoader (worker subprocesses for loading).

    Returns:
        A ``torch.utils.data.DataLoader`` over the windowed dataset.
    """
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }

    # Tokenizer first, then the windowed dataset built on top of it.
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windows = CreateGPTDatasetV1(txt, gpt2_encoding, max_length, stride)

    return DataLoader(windows, **loader_options)

In [13]:
# NOTE(review): this rebinds `dataset` (previously the CreateGPTDatasetV1 instance) to a DataLoader;
# re-running the earlier `dataset.input_ids` / `dataset.target_ids` cells after this one will fail.
# Batches of 2 samples, each max_length = context_length tokens long, windows advancing by 1 token.
dataset = create_dataloader_v1(txt=txt, batch_size=2, max_length=context_length, stride=1)

In [15]:
# Display the token IDs again; presumably kept as a reference for reading the batch contents.
txt_encoded

[44,
 21645,
 12746,
 318,
 262,
 749,
 4388,
 1074,
 287,
 3942,
 9952,
 4041,
 13,
 1119,
 423,
 1839,
 262,
 3670,
 1936,
 1661,
 13]

In [14]:
# `dataset` here is the DataLoader returned by create_dataloader_v1, not a CreateGPTDatasetV1.
# shuffle defaults to True and no seed is set in the cells shown, so the batch drawn can differ between runs.
data_ = iter(dataset)
# A batch is [inputs, targets]; in the output shown each is a 2 x 4 tensor (batch_size x context_length).
first_batch = next(data_)
first_batch

[tensor([[ 287, 3942, 9952, 4041],
         [ 423, 1839,  262, 3670]]),
 tensor([[3942, 9952, 4041,   13],
         [1839,  262, 3670, 1936]])]

Each mini-batch consists of 2 samples (batch_size = 2): the first tensor stacks the inputs of samples 1 and 2, and the second tensor stacks their corresponding targets.

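A larger number of (smaller) batches gives more parameter updates per epoch, which can accelerate training, although each update is then based on a noisier gradient estimate.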
 

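The maximum sensible value of stride is the context length: with stride = context_length every token is still used, while consecutive input windows no longer overlap. Overlapping windows repeat the same tokens across samples and batches, which can increase overfitting.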