In [1]:
# Read the whole training text into memory as a single string.
# The encoding is given explicitly so the result does not depend on the
# platform's default text encoding (the locale code page on Windows).
# NOTE(review): this is an absolute, machine-specific path; a path relative to
# the notebook would make the notebook runnable elsewhere.
with open("C:\\Users\\rouna\\Desktop\\Development\\llm_from_scratch\\Data\\the-verdict.txt", "r", encoding="utf-8") as file:
    raw_text = file.read()

In [2]:
import tiktoken

In [3]:
# Load tiktoken's "gpt2" encoding; this tokenizer object is reused by later cells.
tokenizer = tiktoken.get_encoding("gpt2")

In [4]:
# Encode the full text into token ids and report how many tokens it produced.
token_ids = tokenizer.encode(raw_text)
print(f"Total number of tokens: {len(token_ids)}")

Total number of tokens: 5145


In [5]:
# Number of tokens in one example window.
context_size = 4

# Inputs are the first `context_size` tokens; targets are the same window
# shifted one position to the right, so y[k] is the token that follows x[k].
x, y = token_ids[:context_size], token_ids[1:context_size + 1]

print("Input token ids: ", x)
print("Target token ids:", y)

Input token ids:  [40, 367, 2885, 1464]
Target token ids: [367, 2885, 1464, 1807]


In [6]:
# Every prefix of the window is one training example: `context` is what the
# model sees and `desired` is the token that comes right after it.
for i in range(1, context_size + 1):
    context, desired = token_ids[:i], token_ids[i]
    print(context, "---->", desired)

[40] ----> 367
[40, 367] ----> 2885
[40, 367, 2885] ----> 1464
[40, 367, 2885, 1464] ----> 1807


In [7]:
# NOTE(review): this cell runs the same loop as the previous cell and prints the
# same context/target pairs; it looks like a leftover and could probably be removed.
for i in range(1,context_size+1):
    context=token_ids[:i]
    desired=token_ids[i]
    print(context, "---->", desired)

[40] ----> 367
[40, 367] ----> 2885
[40, 367, 2885] ----> 1464
[40, 367, 2885, 1464] ----> 1807


In [8]:
# Walk the growing prefix of the window and show each context/target pair both
# as token ids and as the text those ids decode to.
for i in range(1, context_size + 1):
    context, desired = token_ids[:i], token_ids[i]
    context_text = tokenizer.decode(context)
    # `desired` is a single id, so it is wrapped in a list like `context` is.
    desired_text = tokenizer.decode([desired])
    print(f"{context} ({context_text!r}) ---> {desired} ({desired_text!r})")

[40] ('I') ---> 367 (' H')
[40, 367] ('I H') ---> 2885 ('AD')
[40, 367, 2885] ('I HAD') ---> 1464 (' always')
[40, 367, 2885, 1464] ('I HAD always') ---> 1807 (' thought')


# Data Loader Using PyTorch 

In [16]:
from torch.utils.data import DataLoader, Dataset
import torch

In [17]:
class GPTDaatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is tokenized once. A window of ``max_length`` tokens is then moved
    over the token ids in steps of ``stride``; each window is an input sample
    and the same window shifted one token to the right is its target.

    NOTE: the class name (with the doubled "a") is kept as is because
    ``create_dataloader_V1`` refers to the class by this name.

    Args:
        txt: Text to tokenize.
        tokenizer: Object with an ``encode(text, allowed_special=...)`` method
            returning a sequence of integer token ids.
        max_length: Number of tokens per sample; must be positive.
        stride: Distance between the starts of consecutive windows; must be
            positive.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is not positive.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Fail early with a clear message: a negative stride would otherwise
        # silently give an empty dataset, and a non-positive max_length would
        # give empty or malformed windows.
        if max_length <= 0:
            raise ValueError(f"max_length must be a positive integer, got {max_length}")
        if stride <= 0:
            raise ValueError(f"stride must be a positive integer, got {stride}")

        self.tokenizer = tokenizer
        self.max_length = max_length
        self.stride = stride
        self.token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        self.inputs = []
        self.targets = []

        # Stop at len - max_length so the shifted target window
        # [i + 1, i + max_length + 1) always fits inside the token list.
        # Text with too few tokens therefore yields an empty dataset.
        for i in range(0, len(self.token_ids) - max_length, stride):
            input_chunks = self.token_ids[i:i + max_length]
            target_chunks = self.token_ids[i + 1:i + max_length + 1]
            self.inputs.append(torch.tensor(input_chunks))
            self.targets.append(torch.tensor(target_chunks))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair stored at position ``idx``."""
        return self.inputs[idx], self.targets[idx]

In [19]:
def create_dataloader_V1(
    txt,
    max_length=256,
    stride=128,
    batch_size=4,
    shuffle=True,
    drop_last=True,
    num_workers=0,
):
    """Build a DataLoader of sliding-window (input, target) batches from text.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    ``GPTDaatasetV1``; the remaining arguments are handed to ``DataLoader``.

    Args:
        txt: Text to tokenize.
        max_length: Window length passed to ``GPTDaatasetV1``.
        stride: Window step passed to ``GPTDaatasetV1``.
        batch_size: Forwarded unchanged to ``DataLoader``.
        shuffle: Forwarded unchanged to ``DataLoader``.
        drop_last: Forwarded unchanged to ``DataLoader``.
        num_workers: Forwarded unchanged to ``DataLoader``.

    Returns:
        The constructed ``DataLoader``.
    """
    windows = GPTDaatasetV1(
        txt,
        tiktoken.get_encoding("gpt2"),
        max_length,
        stride,
    )
    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [20]:
# Echo the loaded text as the cell's output.
# NOTE(review): this displays the entire string; a slice such as raw_text[:200]
# would keep the saved output short.
raw_text

'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)\n\n"The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it\'s going to send the value of my picture \'way up; but I don\'t think of that, Mr. Rickham--the loss to Arrt is all I think of." The word, on Mrs. Thwing\'s lips, multiplied its _rs_ as though they were reflected in an endless vista of mirrors. And it was not only the Mrs. Thwings who mourned. Had not the exquisite Hermia Croft, at the last Grafton Gallery show, stopped me before Gisburn\'s "Moon-dancers" to say, with tears in her eyes: "We shall not look upon its like again"?\n\nWell!--even 

In [21]:
# One sample per batch, 4-token windows moved one token at a time, and no
# shuffling, so batches come out in text order and are easy to inspect.
dataloader=create_dataloader_V1(raw_text, batch_size=1,max_length=4,stride=1,shuffle=False)

In [25]:
# Step through the loader by hand: build an iterator and take its first batch.
data_iter = iter(dataloader)
first_batch = next(data_iter)

In [26]:
# Despite the label, first_batch holds both tensors of the batch:
# the inputs first and the targets second.
print("Input batch:", first_batch)

Input batch: [tensor([[  40,  367, 2885, 1464]]), tensor([[ 367, 2885, 1464, 1807]])]


In [44]:
# With stride=1 the second input window starts at the same token as the first
# target window, so these two tensors hold the same token ids.
dataloader.dataset.inputs[1], dataloader.dataset.targets[0]

(tensor([ 367, 2885, 1464, 1807]), tensor([ 367, 2885, 1464, 1807]))

In [None]:
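# NOTE: this whole cell is commented out; the output recorded below it presumably comes from an earlier run of the uncommented code.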
# import torch
# # show full tensors instead of truncating
# try:
#     torch.set_printoptions(profile="full")
# except Exception:
#     torch.set_printoptions(threshold=10**6)

# # ensure dataloader was created with drop_last=False if you want last partial batch:
# # dataloader = create_dataloader_V1(raw_text, batch_size=1, max_length=4, stride=1, shuffle=False, drop_last=False)

# for batch_idx, (inputs, targets) in enumerate(dataloader):
#     print(f"Batch {batch_idx}: inputs.shape={inputs.shape}, targets.shape={targets.shape}, dtype={inputs.dtype}")
#     # inputs, targets are tensors of shape (batch_size, seq_len)
#     for sample_idx in range(inputs.size(0)):
#         inp_ids = inputs[sample_idx].tolist()
#         tgt_ids = targets[sample_idx].tolist()
#         inp_text = tokenizer.decode(inp_ids)
#         tgt_text = tokenizer.decode(tgt_ids)
#         print(f"  Sample {sample_idx}:")
#         print(f"    input ids : {inp_ids}")
#         print(f"    target ids: {tgt_ids}")
#         print(f"    input text: {inp_text!r}")
#         print(f"    target text: {tgt_text!r}")
#     print("-" * 80)

Batch 0: inputs.shape=torch.Size([1, 4]), targets.shape=torch.Size([1, 4]), dtype=torch.int64
  Sample 0:
    input ids : [40, 367, 2885, 1464]
    target ids: [367, 2885, 1464, 1807]
    input text: 'I HAD always'
    target text: ' HAD always thought'
--------------------------------------------------------------------------------
Batch 1: inputs.shape=torch.Size([1, 4]), targets.shape=torch.Size([1, 4]), dtype=torch.int64
  Sample 0:
    input ids : [367, 2885, 1464, 1807]
    target ids: [2885, 1464, 1807, 3619]
    input text: ' HAD always thought'
    target text: 'AD always thought Jack'
--------------------------------------------------------------------------------
Batch 2: inputs.shape=torch.Size([1, 4]), targets.shape=torch.Size([1, 4]), dtype=torch.int64
  Sample 0:
    input ids : [2885, 1464, 1807, 3619]
    target ids: [1464, 1807, 3619, 402]
    input text: 'AD always thought Jack'
    target text: ' always thought Jack G'
-----------------------------------------------