In [1]:
!pip install tiktoken



In [5]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")

In [6]:
raw_text = """
    LLMs evolved from earlier statistical and recurrent 
    neural network approaches to language modeling.
    The transformer architecture, introduced in 2017,
    replaced recurrence with self-attention, allowing efficient parallelization,
    longer context handling, and scalable training on unprecedented data volumes.
    This innovation enabled models like GPT, BERT, and their successors,
    which demonstrated emergent behaviors at scale, such as few-shot 
    learning and compositional reasoning
"""

In [7]:
enc_text = tokenizer.encode(raw_text)

print(len(enc_text))

117


In [11]:
enc_sample = enc_text[50:]

In [13]:
context_size = 4

x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]

print(f"x: {x}")
print(f"y:       {y}")

x: [6942, 10730, 1634, 11]
y:       [10730, 1634, 11, 198]


In [14]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(context, "---->", desired)

[6942] ----> 10730
[6942, 10730] ----> 1634
[6942, 10730, 1634] ----> 11
[6942, 10730, 1634, 11] ----> 198


In [16]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]

    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))

 efficient ---->  parallel
 efficient parallel ----> ization
 efficient parallelization ----> ,
 efficient parallelization, ----> 



 and ---->  established
 and established ---->  himself
 and established himself ---->  in
 and established himself in ---->  a

In [17]:
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

In [18]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):

    # Initialize the tokenizer
    tokenizer = tiktoken.get_encoding("gpt2")

    # Create dataset
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)

    # Create dataloader
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )

    return dataloader

In [26]:
import os
print(os.getcwd())

/content


In [27]:
# Try adding the folder name to the path
with open("Lecture_9/the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

FileNotFoundError: [Errno 2] No such file or directory: 'Lecture_9/the-verdict.txt'

In [28]:
raw_text = """
    LLMs evolved from earlier statistical and recurrent 
    neural network approaches to language modeling.
    The transformer architecture, introduced in 2017,
    replaced recurrence with self-attention, allowing efficient parallelization,
    longer context handling, and scalable training on unprecedented data volumes.
    This innovation enabled models like GPT, BERT, and their successors,
    which demonstrated emergent behaviors at scale, such as few-shot 
    learning and compositional reasoning.
    Substantial infrastructure is necessary for training the largest models.
    The tendency towards larger models is visible in the list of large language models.
    For example, the training of GPT-2 (i.e. a 1.5-billion-parameters model)
    in 2019 cost $50,000, while training of the PaLM (i.e. a 540-billion-parameters model) 
    in 2022 cost $8 million, and Megatron-Turing NLG 530B (in 2021) cost around $11 million.
    The qualifier "large" in "large language model" is inherently vague, as there is no
    definitive threshold for the number of parameters required to qualify as "large".
    GPT-1 of 2018 has 117 million parameters.
"""

In [29]:
import torch
print("PyTorch version:", torch.__version__)
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=1, shuffle=False
)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

PyTorch version: 2.9.0+cpu
[tensor([[198, 220, 220, 220]]), tensor([[  220,   220,   220, 27140]])]


In [30]:
second_batch = next(data_iter)
print(second_batch)

[tensor([[  220,   220,   220, 27140]]), tensor([[  220,   220, 27140, 10128]])]


In [31]:
dataloader = create_dataloader_v1(raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[  198,   220,   220,   220],
        [27140, 10128, 12572,   422],
        [ 2961, 13905,   290, 42465],
        [  220,   198,   220,   220],
        [  220, 17019,  3127, 10581],
        [  284,  3303, 21128,    13],
        [  198,   220,   220,   220],
        [  383, 47385, 10959,    11]])

Targets:
 tensor([[  220,   220,   220, 27140],
        [10128, 12572,   422,  2961],
        [13905,   290, 42465,   220],
        [  198,   220,   220,   220],
        [17019,  3127, 10581,   284],
        [ 3303, 21128,    13,   198],
        [  220,   220,   220,   383],
        [47385, 10959,    11,  5495]])
