# Data Sampling using Sliding Window

## import required pkg

In [21]:
import numpy as np
import torch
import tiktoken

## load the data

In [22]:
with open("data.txt", "r") as file:
    raw_text = file.read()

In [23]:
raw_text[:100]

'Alice was beginning to get very tired of sitting by her sister\non the bank, and of having nothing to'

In [24]:
# length of context window
CONTEXT_LENGTH = 10

# how many token to jump or skip to find next iteration
STRIDE = 1

## create tokenizer

In [25]:
tokenizer =  tiktoken.get_encoding("gpt2")

In [26]:
# tokenize the raw_text
tokens = tokenizer.encode(raw_text)
len(tokens)

42421

In [27]:
# create sample of 10 words
sample = tokens[:CONTEXT_LENGTH]

In [28]:
# logic to create Sliding window
print(f"input = {tokenizer.decode(sample[:4])}")
print(f"target = {tokenizer.decode(sample[4:4+1])}")

input = Alice was beginning to
target =  get


In [29]:
for i in range(1, len(sample), STRIDE):
    print(f"{sample[:i]} => {sample[i:i+1]}")

[44484] => [373]
[44484, 373] => [3726]
[44484, 373, 3726] => [284]
[44484, 373, 3726, 284] => [651]
[44484, 373, 3726, 284, 651] => [845]
[44484, 373, 3726, 284, 651, 845] => [10032]
[44484, 373, 3726, 284, 651, 845, 10032] => [286]
[44484, 373, 3726, 284, 651, 845, 10032, 286] => [5586]
[44484, 373, 3726, 284, 651, 845, 10032, 286, 5586] => [416]


In [30]:
for i in range(1, len(sample), STRIDE):
    print(f"{tokenizer.decode(sample[:i])} => {tokenizer.decode(sample[i:i+1])}")

Alice =>  was
Alice was =>  beginning
Alice was beginning =>  to
Alice was beginning to =>  get
Alice was beginning to get =>  very
Alice was beginning to get very =>  tired
Alice was beginning to get very tired =>  of
Alice was beginning to get very tired of =>  sitting
Alice was beginning to get very tired of sitting =>  by


# get entire data into sliding window

In [39]:
# store all input chunks
input_ids = []

# store all target chunks
target_ids = []

for i in range(1, len(tokens) - CONTEXT_LENGTH, STRIDE):
    input_chunk = tokens[i : i + CONTEXT_LENGTH]
    target_chunk = tokens[i+1 : i + CONTEXT_LENGTH + 1]

    # stire all input and taerget chunks
    input_ids.append(torch.tensor(input_chunk))
    target_ids.append(torch.tensor(target_chunk))

In [40]:
len(input_ids), len(target_ids)

(42410, 42410)

In [41]:
input_ids[:10]

[tensor([  373,  3726,   284,   651,   845, 10032,   286,  5586,   416,   607]),
 tensor([ 3726,   284,   651,   845, 10032,   286,  5586,   416,   607,  6621]),
 tensor([  284,   651,   845, 10032,   286,  5586,   416,   607,  6621,   198]),
 tensor([  651,   845, 10032,   286,  5586,   416,   607,  6621,   198,   261]),
 tensor([  845, 10032,   286,  5586,   416,   607,  6621,   198,   261,   262]),
 tensor([10032,   286,  5586,   416,   607,  6621,   198,   261,   262,  3331]),
 tensor([ 286, 5586,  416,  607, 6621,  198,  261,  262, 3331,   11]),
 tensor([5586,  416,  607, 6621,  198,  261,  262, 3331,   11,  290]),
 tensor([ 416,  607, 6621,  198,  261,  262, 3331,   11,  290,  286]),
 tensor([ 607, 6621,  198,  261,  262, 3331,   11,  290,  286, 1719])]

In [42]:
target_ids[:10]

[tensor([ 3726,   284,   651,   845, 10032,   286,  5586,   416,   607,  6621]),
 tensor([  284,   651,   845, 10032,   286,  5586,   416,   607,  6621,   198]),
 tensor([  651,   845, 10032,   286,  5586,   416,   607,  6621,   198,   261]),
 tensor([  845, 10032,   286,  5586,   416,   607,  6621,   198,   261,   262]),
 tensor([10032,   286,  5586,   416,   607,  6621,   198,   261,   262,  3331]),
 tensor([ 286, 5586,  416,  607, 6621,  198,  261,  262, 3331,   11]),
 tensor([5586,  416,  607, 6621,  198,  261,  262, 3331,   11,  290]),
 tensor([ 416,  607, 6621,  198,  261,  262, 3331,   11,  290,  286]),
 tensor([ 607, 6621,  198,  261,  262, 3331,   11,  290,  286, 1719]),
 tensor([6621,  198,  261,  262, 3331,   11,  290,  286, 1719, 2147])]