In [55]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import tiktoken

Reusing the dataset class and dataloader function defined in the previous file.

In [56]:
class CreateGPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    The whole text is tokenized once, then windows of ``context_length``
    token IDs are cut every ``stride`` tokens. Each target window is its
    input window shifted one token to the right.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Encoder whose ``encode`` is called with
            ``allowed_special={"<|endoftext|>"}``.
        context_length: Number of token IDs in every input and target window.
        stride: Offset, in tokens, between the starts of consecutive windows.

    Note:
        With a positive ``stride``, the dataset is empty when the text
        encodes to ``context_length`` tokens or fewer.
    """

    def __init__(self, txt: str, tokenizer: tiktoken.Encoding, context_length: int, stride: int):
        encoded = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Stop early enough that the shifted target window never runs past
        # the last token.
        window_starts = range(0, len(encoded) - context_length, stride)

        self.input_ids = [
            torch.tensor(encoded[start:start + context_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(encoded[start + 1:start + 1 + context_length])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [57]:
def create_dataloader_v1(txt, batch_size, max_length, 
                         stride, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a ``DataLoader`` over sliding-window GPT-2 token sequences.

    Args:
        txt: Raw text to tokenize and split into windows.
        batch_size: Number of (input, target) pairs per batch.
        max_length: Window length in tokens; passed to the dataset as its
            context length.
        stride: Token offset between the starts of consecutive windows.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` wrapping a ``CreateGPTDatasetV1`` built from ``txt``.
    """
    # The tokenizer is always the GPT-2 BPE encoding.
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windowed_dataset = CreateGPTDatasetV1(txt, gpt2_encoding, max_length, stride)

    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windowed_dataset, **loader_options)

STEP1: Prepare DATA

In [58]:
# Read the whole training text into memory as a single string.
with open("the-verdict.txt", "r", encoding="utf-8") as text_file:
    raw_text = text_file.read()

# GPT-2 byte-pair-encoding tokenizer.
tokenizer = tiktoken.get_encoding('gpt2')

# Sliding-window settings: 4 tokens per sequence, windows start 1 token apart.
context_length, stride = 4, 1

In [59]:
# Seed PyTorch's global RNG so the shuffled batch order (and the random
# initialisations later in this notebook) are reproducible when the notebook
# is run top to bottom on a fresh kernel.
torch.manual_seed(123)

# Batches of 8 sequences, each `context_length` tokens long.
dataloader = create_dataloader_v1(
    txt=raw_text,
    batch_size=8,
    max_length=context_length,
    stride=stride,
)

In [60]:
# Take the first mini-batch; the loader yields an (inputs, targets) tensor pair.
batch_1_ip, batch_1_target = next(iter(dataloader))
# Display the input batch shape: (batch_size, context_length).
batch_1_ip.shape

torch.Size([8, 4])

Each minibatch is a pair of tensors: the first holds the 8 input sequences and the second holds the 8 corresponding target sequences (each target is its input shifted one token to the right). Each row is one sequence of `context_length` = 4 token IDs.

STEP2: Create Embeddings

In [None]:
# Create token embeddings
# The embedding table has one row per vocabulary entry (50257 for GPT-2) and
# one column per embedding dimension (256 here).

vocab_size = 50257 # for GPT2 this is the vocab size
output_dim = 256   # width of every embedding vector
token_embedding_layer = nn.Embedding(vocab_size, output_dim)

In [62]:
# Weight matrix shape: (vocabulary size, embedding dimension).
token_embedding_layer.weight.shape

torch.Size([50257, 256])

In [65]:
# Pass the first input batch through the token embedding layer: every token ID
# is replaced by its embedding row, which adds a trailing embedding dimension.
token_embeddings = token_embedding_layer(batch_1_ip)
token_embeddings.shape

torch.Size([8, 4, 256])

In [66]:
# Create pos embeddings
# The positional table needs one row per position in a sequence, so its
# "vocabulary" size is context_length rather than the token vocabulary size.
# Its width (256) must match the token embeddings so the two can be summed.
pos_embedding_layer = nn.Embedding(context_length, 256)
pos_embedding_layer.weight.shape

torch.Size([4, 256])

In [67]:
# Look up the embedding for every position index 0 .. context_length - 1.
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
pos_embeddings.shape

torch.Size([4, 256])

In [68]:
# Add positional information to the token embeddings. The (4, 256) positional
# tensor is broadcast across the batch dimension of the (8, 4, 256) token tensor.
final_embeddings = token_embeddings + pos_embeddings
final_embeddings.shape

torch.Size([8, 4, 256])