# Data sampling with a sliding window using PyTorch's `Dataset`

### Import the required packages

In [1]:
import numpy as np
import torch
import tiktoken
from torch.utils.data import Dataset, DataLoader

## load the data

In [2]:
# Read the whole corpus into memory as a single string. The encoding is
# passed explicitly so decoding does not depend on the platform's locale.
with open("data.txt", "r", encoding="utf-8") as file:
    raw_text = file.read()

## Create a dataset of input and target token sequences

In [4]:
class DreamLLMDataset(Dataset):
    """Sliding-window dataset of (input, target) token-id sequences.

    The text is tokenized once with tiktoken's ``gpt2`` encoding. Every
    sample pairs a window of ``context_length`` token ids with the same
    window shifted one position to the right.
    """

    def __init__(self, raw_text, context_length=10, stride=1):
        """Tokenize ``raw_text`` and precompute all input/target windows.

        Args:
            raw_text: Text to tokenize.
            context_length: Number of token ids in each input and target
                window.
            stride: Number of tokens between the starts of consecutive
                windows.
        """
        encoding = tiktoken.get_encoding('gpt2')
        token_ids = encoding.encode(raw_text)

        # Stop context_length tokens before the end so the target window,
        # which starts one token later, is never cut short.
        window_starts = range(0, len(token_ids) - context_length, stride)

        # Input windows: context_length tokens beginning at each start.
        self.input_ids = [
            torch.tensor(token_ids[start:start + context_length])
            for start in window_starts
        ]
        # Target windows: the same span shifted right by one token.
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + context_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of precomputed windows."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(input, target)`` tensor pair stored at ``index``."""
        inputs = self.input_ids[index]
        targets = self.target_ids[index]
        return inputs, targets

In [5]:
# Build the sliding-window dataset with its default context length and stride.
dataset = DreamLLMDataset(raw_text=raw_text)

# Wrap it in a loader that yields five windows per batch, in dataset order.
data_loader = DataLoader(dataset=dataset, batch_size=5, shuffle=False)

In [6]:
# Create an iterator over the loader so batches can be pulled one at a time
# with next().
data_loader_iterator = iter(data_loader)

In [7]:
# Fetch the first batch: a pair of tensors (inputs, targets), each with
# batch_size rows of context_length token ids. Each target row is its input
# row shifted one token to the right.
next(data_loader_iterator)

[tensor([[44484,   373,  3726,   284,   651,   845, 10032,   286,  5586,   416],
         [  373,  3726,   284,   651,   845, 10032,   286,  5586,   416,   607],
         [ 3726,   284,   651,   845, 10032,   286,  5586,   416,   607,  6621],
         [  284,   651,   845, 10032,   286,  5586,   416,   607,  6621,   198],
         [  651,   845, 10032,   286,  5586,   416,   607,  6621,   198,   261]]),
 tensor([[  373,  3726,   284,   651,   845, 10032,   286,  5586,   416,   607],
         [ 3726,   284,   651,   845, 10032,   286,  5586,   416,   607,  6621],
         [  284,   651,   845, 10032,   286,  5586,   416,   607,  6621,   198],
         [  651,   845, 10032,   286,  5586,   416,   607,  6621,   198,   261],
         [  845, 10032,   286,  5586,   416,   607,  6621,   198,   261,   262]])]

In [8]:
# Fetch the next batch; its windows continue right after the last window of
# the previous batch.
next(data_loader_iterator)

[tensor([[  845, 10032,   286,  5586,   416,   607,  6621,   198,   261,   262],
         [10032,   286,  5586,   416,   607,  6621,   198,   261,   262,  3331],
         [  286,  5586,   416,   607,  6621,   198,   261,   262,  3331,    11],
         [ 5586,   416,   607,  6621,   198,   261,   262,  3331,    11,   290],
         [  416,   607,  6621,   198,   261,   262,  3331,    11,   290,   286]]),
 tensor([[10032,   286,  5586,   416,   607,  6621,   198,   261,   262,  3331],
         [  286,  5586,   416,   607,  6621,   198,   261,   262,  3331,    11],
         [ 5586,   416,   607,  6621,   198,   261,   262,  3331,    11,   290],
         [  416,   607,  6621,   198,   261,   262,  3331,    11,   290,   286],
         [  607,  6621,   198,   261,   262,  3331,    11,   290,   286,  1719]])]