### **Creating Input-Output/Target pairs**

In [1]:
import torch
import tiktoken

In [2]:
# load the whole text file into memory as a single string
with open("../data/harry-potter.txt", mode="r", encoding="utf-8") as text_file:
    raw_text = text_file.read()

# preview the first 50 characters
raw_text[:50]

"Harry Potter and the Sorcerer's Stone\n\n\nCHAPTER ON"

In [3]:
# encode the whole text into token ids with the "gpt2" encoding
tokenizer = tiktoken.get_encoding("gpt2")
encoded_raw_text = tokenizer.encode(raw_text)

# show the first 50 token ids
print(encoded_raw_text[:50])

[18308, 14179, 290, 262, 30467, 338, 8026, 628, 198, 41481, 16329, 198, 198, 10970, 16494, 56, 19494, 406, 3824, 1961, 198, 198, 5246, 13, 290, 9074, 13, 360, 1834, 1636, 11, 286, 1271, 1440, 11, 4389, 16809, 9974, 11, 547, 6613, 284, 910, 198, 5562, 484, 547, 7138, 3487, 11]


In [18]:
# build one input/target pair: the target is the input window shifted by one token
context_size = 10

# start the sample 401 tokens into the encoded text
encoded_sample = encoded_raw_text[401:]
x, y = encoded_sample[0:context_size], encoded_sample[1 : context_size + 1]

print(f"x: {x}")
print(f"y:      {y}")

x: [2215, 1770, 13, 290, 9074, 13, 360, 1834, 1636, 19092]
y:      [1770, 13, 290, 9074, 13, 360, 1834, 1636, 19092, 510]


In [19]:
# visualize every growing context next to the token id that should follow it
for i in range(1, context_size + 1):
    # named `context` (not `input`) so the built-in input() is not shadowed in the kernel
    context = encoded_sample[:i]
    target = encoded_sample[i]

    print(f"{context} ---> {target}")

[2215] ---> 1770
[2215, 1770] ---> 13
[2215, 1770, 13] ---> 290
[2215, 1770, 13, 290] ---> 9074
[2215, 1770, 13, 290, 9074] ---> 13
[2215, 1770, 13, 290, 9074, 13] ---> 360
[2215, 1770, 13, 290, 9074, 13, 360] ---> 1834
[2215, 1770, 13, 290, 9074, 13, 360, 1834] ---> 1636
[2215, 1770, 13, 290, 9074, 13, 360, 1834, 1636] ---> 19092
[2215, 1770, 13, 290, 9074, 13, 360, 1834, 1636, 19092] ---> 510


In [20]:
# same visualization as token ids, but decoded back to text
for i in range(1, context_size + 1):
    # named `context` (not `input`) so the built-in input() is not shadowed in the kernel
    context = encoded_sample[:i]
    target = encoded_sample[i]

    # decode() takes a list of token ids, so the single target id is wrapped in a list
    print(f"{tokenizer.decode(context)} ---> {tokenizer.decode([target])}")

When --->  Mr
When Mr ---> .
When Mr. --->  and
When Mr. and --->  Mrs
When Mr. and Mrs ---> .
When Mr. and Mrs. --->  D
When Mr. and Mrs. D ---> urs
When Mr. and Mrs. Durs ---> ley
When Mr. and Mrs. Dursley --->  woke
When Mr. and Mrs. Dursley woke --->  up


In [None]:
# now let's implement a PyTorch dataset
from torch.utils.data import DataLoader, Dataset

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is tokenized once. A window of ``context_size`` tokens is then
    moved over the token ids in steps of ``stride``. For each window position
    the input is the window itself and the target is the same window shifted
    one token to the right.

    Args:
        raw_text: Text to tokenize.
        tokenizer: Object with an ``encode(text, allowed_special=...)`` method
            that returns a list of integer token ids.
        context_size: Number of tokens in every input/target chunk; must be > 0.
        stride: Number of tokens the window advances between samples; must be > 0.

    Raises:
        ValueError: If ``context_size`` or ``stride`` is not positive.

    Note:
        When the text has ``context_size`` tokens or fewer, no full
        input/target pair fits and the dataset is empty.
    """

    def __init__(self, raw_text, tokenizer, context_size, stride):
        # A negative stride or non-positive context size would otherwise
        # silently produce an empty dataset or misaligned windows.
        if context_size <= 0:
            raise ValueError(f"context_size must be a positive integer, got {context_size}")
        if stride <= 0:
            raise ValueError(f"stride must be a positive integer, got {stride}")

        self.input_ids = []
        self.target_ids = []

        # tokenize the raw text
        token_ids = tokenizer.encode(raw_text, allowed_special={"<|endoftext|>"})

        # use a sliding window to create the input/target dataset.
        # The range stops at len(token_ids) - context_size so the target chunk,
        # which ends one token after the input chunk, never runs past the data
        # (see the main lecture_9.ipynb for the longer explanation).
        for i in range(0, len(token_ids) - context_size, stride):
            input_chunk = token_ids[i : i + context_size]
            target_chunk = token_ids[i + 1 : i + context_size + 1]

            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of (input, target) pairs."""
        return len(self.input_ids)

    # lets the PyTorch DataLoader fetch one item of the dataset at a time
    def __getitem__(self, index):
        """Return the ``(input_ids, target_ids)`` tensor pair stored at ``index``."""
        return self.input_ids[index], self.target_ids[index]

In [37]:
# let's create a dataloader to load our dataset
def create_dataloader_v1(
    raw_text,
    batch_size=4,
    context_size=256,
    stride=128,
    shuffle=True,
    drop_last=True,
    num_workers=0,
):
    """Build a DataLoader over sliding-window token chunks of ``raw_text``.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    ``GPTDatasetV1``.

    Args:
        raw_text: Text to tokenize and split into samples.
        batch_size: Passed through to ``DataLoader``.
        context_size: Chunk length in tokens, passed to ``GPTDatasetV1``.
        stride: Window step in tokens, passed to ``GPTDatasetV1``.
        shuffle: Passed through to ``DataLoader``.
        drop_last: Passed through to ``DataLoader``.
        num_workers: Passed through to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    gpt2_tokenizer = tiktoken.get_encoding("gpt2")
    window_dataset = GPTDatasetV1(raw_text, gpt2_tokenizer, context_size, stride)

    return DataLoader(
        window_dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [38]:
# build an unshuffled loader, then pull the first batch from it
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=4,
    context_size=256,
    stride=128,
    shuffle=False,
    drop_last=True,
)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)

In [39]:
# show the first batch: each target row is its input row shifted by one token
print(f"Inputs:\n {inputs}", f"Targets:\n {targets}", sep="\n")

Inputs:
 tensor([[18308, 14179,   290,  ...,    13,   198, 25396],
        [ 1588, 49303,    13,  ...,  1010,  1497,    26],
        [  353,   373,  9074,  ..., 10381,  1613,   262],
        [  484,  1422,   470,  ...,   257,  1218,    11]])
Targets:
 tensor([[14179,   290,   262,  ...,   198, 25396,   353],
        [49303,    13,  9074,  ...,  1497,    26,   484],
        [  373,  9074,    13,  ...,  1613,   262,  4324],
        [ 1422,   470,   765,  ...,  1218,    11,  1770]])
