In [1]:
import tiktoken

In [2]:
with open("../resources/verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

In [3]:
tokeniser = tiktoken.get_encoding("gpt2")

In [4]:
enc_token = tokeniser.encode(raw_text)
print(len(enc_token))

5170


In [5]:
enc_sample = enc_token[:30]

In [6]:
context_length = 4
input_tokens = enc_sample[0:context_length]
target_tokens = enc_sample[1:context_length+1]
print(input_tokens)
print(target_tokens)

[464, 4643, 11600, 25]
[4643, 11600, 25, 1717]


In [7]:
# Start at 1 rather than 0: enc_sample[:0] is an empty list, i.e. a context with no tokens.
# The stop value is context_length + 1 so that the last context shown has context_length tokens.
for i in range(1, context_length + 1):
    context, desired = enc_sample[:i], enc_sample[i]
    print(context, "-->", desired)
    
    

[464] --> 4643
[464, 4643] --> 11600
[464, 4643, 11600] --> 25
[464, 4643, 11600, 25] --> 1717


In [8]:
for i in range(1, context_length + 1):
    context, desired = enc_sample[:i], enc_sample[i]
    # decode() is given a list of token ids: `context` is already a list, whereas
    # `desired` is a single token id and therefore has to be wrapped in a list first.
    context_text = tokeniser.decode(context)
    desired_text = tokeniser.decode([desired])
    print(context_text, "-->", desired_text)

The -->  Ver
The Ver --> dict
The Verdict --> :
The Verdict: -->  Ed


In [9]:
from torch.utils.data import Dataset, DataLoader
import torch

In [10]:
# before we jump into the next section lets revisit a for loop with steps
for i in range(0, 11, 3):
    print(i)

# notice that range(start, stop, step) begins at <start> (0 here) and then adds <step> (3 here)
# on every iteration, giving 0, 3, 6, 9. It stops before reaching <stop> (11), so 12 is never
# produced.
# this logic is important to know as we look at the dataset class we define below, where the
# stride is used as the step.

0
3
6
9


In [11]:
class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id pairs for next-token prediction.

    The text is tokenised once. A window of ``max_seq_length`` tokens is then moved
    over the token ids in steps of ``stride``; every sample pairs that window with
    the same window shifted one token to the right.

    Args:
        text: Raw text to tokenise.
        tokeniser: Object whose ``encode(text, allowed_special=...)`` returns a list
            of integer token ids (e.g. a ``tiktoken.Encoding``).
        max_seq_length: Number of tokens in every input and every target window.
            Must be at least 1.
        stride: Distance between the starts of consecutive windows. Must be at
            least 1. With ``stride=1`` consecutive windows overlap in all but one
            token; with ``stride=max_seq_length`` consecutive input windows do not
            overlap at all.

    Raises:
        ValueError: If ``max_seq_length`` or ``stride`` is smaller than 1.
    """

    # The annotation is quoted so it is not evaluated when the class is defined.
    def __init__(self, text, tokeniser: "tiktoken.Encoding", max_seq_length, stride):
        if max_seq_length < 1:
            raise ValueError(f"max_seq_length must be >= 1, got {max_seq_length}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")

        token_ids = tokeniser.encode(text, allowed_special={"<|endoftext|>"})

        # The target window reaches one token further than the input window, so the
        # last usable start index is len(token_ids) - max_seq_length - 1. A text with
        # max_seq_length tokens or fewer therefore yields an empty dataset.
        window_starts = range(0, len(token_ids) - max_seq_length, stride)

        # Note that the stride only controls how much *consecutive windows* overlap;
        # an input and its own target always share max_seq_length - 1 tokens.
        self.input_ids = [
            torch.tensor(token_ids[start:start + max_seq_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_seq_length + 1])
            for start in window_starts
        ]

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair stored at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

    def __len__(self):
        """Return the number of (input, target) windows in the dataset."""
        return len(self.input_ids)


In [12]:
def create_dataloader(text, max_seq_length, stride, batch_size, shuffle, drop_last, num_workers):
    """Tokenise ``text`` with the GPT-2 encoding and wrap it in a ``DataLoader``.

    Args:
        text: Raw text handed to ``GPTDatasetV1``.
        max_seq_length: Window length forwarded to ``GPTDatasetV1``.
        stride: Window step forwarded to ``GPTDatasetV1``.
        batch_size: Forwarded to ``DataLoader``.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` over the ``GPTDatasetV1`` built from ``text``.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windowed_dataset = GPTDatasetV1(text, gpt2_encoding, max_seq_length, stride)
    return DataLoader(
        windowed_dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [13]:
# we will now use the dataloader and see an example using the verdict.txt dataset

dataloader = create_dataloader(raw_text, max_seq_length= 4, stride=1,batch_size=1, shuffle=False,
                              drop_last=True, num_workers=0)

In [14]:
print(type(dataloader))

<class 'torch.utils.data.dataloader.DataLoader'>


In [15]:
# let's convert it to an iterator so we can inspect the components of the loader
# an object that implements an __iter__ method can be converted to an iterator with iter()
iter(dataloader)

<torch.utils.data.dataloader._SingleProcessDataLoaderIter at 0x114d12e90>

In [16]:
# Let's take a look at the first and second iterations of the DataLoader.
# Each batch is a pair of tensors: the input token ids and the target token ids
# (the inputs shifted by one token).
# When we retrieve the second batch, we see that it consists of the same data but slightly shifted, 
# as determined by the stride parameter.

# Key observations:
# 1. The total number of batches is determined by the number of tokens, `max_seq_length`, the stride
#    and `batch_size`.
# 2. Each tensor in a batch has shape (batch_size, max_seq_length) - in this case (1, 4).
# 3. The stride of 1 means that consecutive batches have overlapping sequences, with each new batch 
#    shifted by 1 position compared to the previous batch.

# Example:
data_iter = iter(dataloader)
first_batch = next(data_iter)
print("First batch:", first_batch, "\n")

second_batch = next(data_iter)
print("Second batch:", second_batch)

First batch: [tensor([[  464,  4643, 11600,    25]]), tensor([[ 4643, 11600,    25,  1717]])] 

Second batch: [tensor([[ 4643, 11600,    25,  1717]]), tensor([[11600,    25,  1717,   342]])]


In [17]:
# lets try stride 2 and 3
stride2 = create_dataloader(raw_text, max_seq_length= 4, stride=2,batch_size=1, shuffle=False,
                              drop_last=True, num_workers=0)

stride3 = create_dataloader(raw_text, max_seq_length= 4, stride=3,batch_size=1, shuffle=False,
                              drop_last=True, num_workers=0)

In [18]:
iter_stride2 = iter(stride2)
print(next(iter_stride2))
print(next(iter_stride2))

[tensor([[  464,  4643, 11600,    25]]), tensor([[ 4643, 11600,    25,  1717]])]
[tensor([[11600,    25,  1717,   342]]), tensor([[  25, 1717,  342,  854]])]


In [19]:
iter_stride3 = iter(stride3)
print(next(iter_stride3))
print(next(iter_stride3))

[tensor([[  464,  4643, 11600,    25]]), tensor([[ 4643, 11600,    25,  1717]])]
[tensor([[  25, 1717,  342,  854]]), tensor([[ 1717,   342,   854, 41328]])]


In [20]:
# lets also look at a larger batch size
large_batch= create_dataloader(raw_text, max_seq_length= 4, stride=2,batch_size=4, shuffle=False,
                              drop_last=True, num_workers=0)

In [21]:
batch_iter = iter(large_batch)
print(next(batch_iter))

[tensor([[  464,  4643, 11600,    25],
        [11600,    25,  1717,   342],
        [ 1717,   342,   854, 41328],
        [  854, 41328,    25, 40417]]), tensor([[ 4643, 11600,    25,  1717],
        [   25,  1717,   342,   854],
        [  342,   854, 41328,    25],
        [41328,    25, 40417,   198]])]


In [22]:
print(next(batch_iter))

[tensor([[   25, 40417,   198,  3109],
        [  198,  3109,  9213,   422],
        [ 9213,   422, 11145,   271],
        [11145,   271,  1668,   319]]), tensor([[40417,   198,  3109,  9213],
        [ 3109,  9213,   422, 11145],
        [  422, 11145,   271,  1668],
        [  271,  1668,   319,  3267]])]
