# Data Sampling with a Sliding Window

In [None]:
import tiktoken


In [11]:
# Read the whole source text into memory as a single UTF-8 string.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
# Use tiktoken's "gpt2" encoding to turn the text into a list of integer token IDs.
tokensier = tiktoken.get_encoding("gpt2")
enc_text = tokensier.encode(raw_text)

# Report how many tokens the text produced.
print(f"Length of text in tokens: {len(enc_text)}")

Length of text in tokens: 5145


In [3]:
# Echo the encoded token IDs so they can be inspected in the cell output.
enc_text

[40,
 367,
 2885,
 1464,
 1807,
 3619,
 402,
 271,
 10899,
 2138,
 257,
 7026,
 15632,
 438,
 2016,
 257,
 922,
 5891,
 1576,
 438,
 568,
 340,
 373,
 645,
 1049,
 5975,
 284,
 502,
 284,
 3285,
 326,
 11,
 287,
 262,
 6001,
 286,
 465,
 13476,
 11,
 339,
 550,
 5710,
 465,
 12036,
 11,
 6405,
 257,
 5527,
 27075,
 11,
 290,
 4920,
 2241,
 287,
 257,
 4489,
 64,
 319,
 262,
 34686,
 41976,
 13,
 357,
 10915,
 314,
 2138,
 1807,
 340,
 561,
 423,
 587,
 10598,
 393,
 28537,
 2014,
 198,
 198,
 1,
 464,
 6001,
 286,
 465,
 13476,
 1,
 438,
 5562,
 373,
 644,
 262,
 1466,
 1444,
 340,
 13,
 314,
 460,
 3285,
 9074,
 13,
 46606,
 536,
 5469,
 438,
 14363,
 938,
 4842,
 1650,
 353,
 438,
 2934,
 489,
 3255,
 465,
 48422,
 540,
 450,
 67,
 3299,
 13,
 366,
 5189,
 1781,
 340,
 338,
 1016,
 284,
 3758,
 262,
 1988,
 286,
 616,
 4286,
 705,
 1014,
 510,
 26,
 475,
 314,
 836,
 470,
 892,
 286,
 326,
 11,
 1770,
 13,
 8759,
 2763,
 438,
 1169,
 2994,
 284,
 943,
 17034,
 318,
 477,
 314,
 892,


In [5]:
# Skip the first 50 token IDs; the examples below work on the remainder.
enc_sample = enc_text[50:]


In [None]:
# Use a tiny context size of 4 tokens for illustration (GPT-2 uses 1024).
context_size = 4
# Inputs: the first `context_size` tokens of the sample.
x = enc_sample[:context_size]  # input tokens
# Targets: the same window shifted one position to the right, so y[k] is the
# token that immediately follows x[k] in the text.
y = enc_sample[1 : context_size + 1]  # next-token targets
print(f"x:{x}")
print(f"y:     {y}")

x:[290, 4920, 2241, 287]
y:     [4920, 2241, 287, 257]


In [13]:
# For every prefix length from 1 to context_size, show the prefix (context)
# and the single token that follows it (the prediction target).
for i in range(1, context_size+1):
    context = enc_sample[:i]   # first i tokens
    desired = enc_sample[i]    # the token right after that prefix
    # First as raw token IDs ...
    print(context, "->", desired)
    # ... then decoded back to text; decode() takes a list, hence [desired].
    print(tokensier.decode(context), "->", tokensier.decode([desired]), "\n")
    

[290] -> 4920
 and ->  established 

[290, 4920] -> 2241
 and established ->  himself 

[290, 4920, 2241] -> 287
 and established himself ->  in 

[290, 4920, 2241, 287] -> 257
 and established himself in ->  a 



In [10]:
import torch
# Echo the installed PyTorch version in the cell output.
torch.__version__

'2.9.1'

In [16]:
from torch.utils.data import Dataset, DataLoader

class GPTDataSetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    The text is tokenized once, then cut into windows of ``max_length``
    tokens whose start positions are ``stride`` tokens apart. For each window
    the target is the same window shifted one token to the right, so
    ``target[k]`` is the token that follows ``input[k]`` in the text.

    Args:
        text: Raw text to tokenize.
        tokensier: Object with an ``encode(text, allowed_special=...)`` method
            returning a sequence of integer token IDs.
        max_length: Number of tokens in every input and target window (>= 1).
        stride: Distance in tokens between consecutive window starts (>= 1).

    Raises:
        ValueError: If ``max_length`` or ``stride`` is smaller than 1.
    """

    def __init__(self, text, tokensier, max_length, stride):
        if max_length < 1:
            raise ValueError(f"max_length must be >= 1, got {max_length}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")

        self.input_ids = []
        self.target_ids = []

        token_ids = tokensier.encode(text, allowed_special={"<|endoftext|>"})

        # A window starting at i consumes token_ids[i : i + max_length + 1]
        # (inputs plus the one extra token needed for the shifted target), so
        # the last valid start is len(token_ids) - max_length - 1. Going one
        # step further would produce a target that is one token short, which
        # breaks batching because the samples no longer have equal length.
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i : i + max_length]
            target_chunk = token_ids[i + 1 : i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk, dtype=torch.long))
            self.target_ids.append(torch.tensor(target_chunk, dtype=torch.long))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [17]:
def create_dataloader_v1(text, batch_size, max_length=256, stride=128,
                         shuffle=True, drop_last=True, num_workers=0):
    """Wrap ``text`` in a ``GPTDataSetV1`` and return a ``DataLoader`` over it.

    The text is tokenized with tiktoken's "gpt2" encoding. ``max_length`` and
    ``stride`` are passed to the dataset; ``batch_size``, ``shuffle``,
    ``drop_last`` and ``num_workers`` are passed to the ``DataLoader``.
    """
    # Collect the loader settings up front so the construction below reads
    # as "dataset, then loader".
    loader_options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

    # Tokenize with the "gpt2" encoding and cut the result into windows.
    windows = GPTDataSetV1(text, tiktoken.get_encoding("gpt2"), max_length, stride)

    return DataLoader(windows, **loader_options)

In [18]:
# Load the raw text as one UTF-8 string; it is the input to
# create_dataloader_v1 in the next cell.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [32]:
# Batches of 8 windows, each 4 tokens long. With stride=2 consecutive windows
# start 2 tokens apart, so neighbouring rows overlap by 2 tokens.
# shuffle=False keeps the windows in their original text order.
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=2, shuffle=False)

# Pull just the first batch from the loader.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
# Each target row is the matching input row shifted one token to the right.
print("Inputs:\n", inputs)
print("Targets:\n", targets)

Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 2885,  1464,  1807,  3619],
        [ 1807,  3619,   402,   271],
        [  402,   271, 10899,  2138],
        [10899,  2138,   257,  7026],
        [  257,  7026, 15632,   438],
        [15632,   438,  2016,   257],
        [ 2016,   257,   922,  5891]])
Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 1464,  1807,  3619,   402],
        [ 3619,   402,   271, 10899],
        [  271, 10899,  2138,   257],
        [ 2138,   257,  7026, 15632],
        [ 7026, 15632,   438,  2016],
        [  438,  2016,   257,   922],
        [  257,   922,  5891,  1576]])
