In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
import tiktoken


In [2]:
# Embedding-table dimensions used by the cells below.
vocab_size = 50257  # number of rows in the token embedding table (one per token ID)
output_dim = 256    # width of every embedding vector

In [3]:
# Lookup table with one learnable `output_dim`-wide row per token ID.
# NOTE(review): the weights are randomly initialised and no seed is set in the
# visible cells, so the actual values differ between kernel runs.
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [4]:
# Sanity check: the weight matrix should be (vocab_size, output_dim).
token_embedding_layer.weight.shape

torch.Size([50257, 256])

In [5]:
# GPT-2 byte-pair-encoding tokenizer from tiktoken.
tokenizer = tiktoken.get_encoding("gpt2")

# Read the whole training text into memory as a single string.
# The encoding is given explicitly: without it, open() decodes with the
# platform's locale encoding, which on Windows is usually not UTF-8 and can
# corrupt or reject non-ASCII characters (the text is expected to be UTF-8).
# NOTE(review): this is an absolute, machine-specific path; consider a path
# relative to the notebook or a configurable data directory.
with open("C:\\Users\\rouna\\Desktop\\Development\\llm_from_scratch\\Data\\the-verdict.txt", "r", encoding="utf-8") as file:
    raw_text = file.read()

In [6]:
class GPTDaatasetV1(Dataset):
    """Sliding-window next-token dataset built from a single text.

    The text is tokenised once; windows of ``max_length`` token IDs are then
    taken every ``stride`` tokens. Each sample is an ``(input, target)`` pair
    of 1-D tensors where the target is the input window shifted right by one
    token. Only windows with a full-length target are produced.

    Args:
        txt: Text to tokenise.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sequence of integer token IDs.
        max_length: Number of tokens in every input/target window.
        stride: Offset, in tokens, between the starts of consecutive windows.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.stride = stride
        # "<|endoftext|>" is allowed through as a special token.
        self.token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        ids = self.token_ids
        # Stopping at len(ids) - max_length leaves room for the shifted target.
        window_starts = range(0, len(ids) - max_length, stride)

        self.inputs = [
            torch.tensor(ids[start:start + max_length])
            for start in window_starts
        ]
        self.targets = [
            torch.tensor(ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair stored at ``idx``."""
        return self.inputs[idx], self.targets[idx]
    
def create_dataloader_V1(txt, max_length=256, stride=128, batch_size=4, shuffle=True, drop_last=True, num_workers=0):
    """Build a ``DataLoader`` of sliding-window (input, target) batches.

    ``txt`` is tokenised with tiktoken's "gpt2" encoding and wrapped in a
    ``GPTDaatasetV1``; the remaining arguments are passed on unchanged.

    Args:
        txt: Text to tokenise and split into windows.
        max_length: Tokens per window (passed to the dataset).
        stride: Offset between window starts (passed to the dataset).
        batch_size: Samples per batch.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        The configured ``torch.utils.data.DataLoader``.
    """
    return DataLoader(
        GPTDaatasetV1(txt, tiktoken.get_encoding("gpt2"), max_length, stride),
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [7]:
# Small demo: 4-token windows, 8 per batch.
max_length = 4

# stride == max_length, so consecutive input windows do not overlap;
# shuffle=False keeps the windows in text order.
dataloader = create_dataloader_V1(
    raw_text,
    batch_size=8,
    max_length=max_length,
    stride=max_length,
    shuffle=False,
)

# Pull the first batch only.
data_iter = iter(dataloader)
input_ids, target_ids = next(data_iter)

print("input_ids:", input_ids)
print("\ntarget_ids:", target_ids)

input_ids: tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])

target_ids: tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287]])


In [8]:
# Replace every token ID in the batch with its embedding row; the result gains
# a trailing `output_dim` axis.
token_embeddings = token_embedding_layer(input_ids)

print("\ntoken_embeddings.shape:", token_embeddings.shape)


token_embeddings.shape: torch.Size([8, 4, 256])


In [9]:
# The context window is as long as one input sequence.
context_length = max_length

# One learnable `output_dim`-wide vector per position in the context window.
pos_embeddings_layer = torch.nn.Embedding(context_length, output_dim)

In [10]:
# Look up the embedding for every position index 0 .. context_length - 1,
# giving one row per position.
pos_embeddings=pos_embeddings_layer(torch.arange(context_length))

In [None]:
# Sanity check: expect (context_length, output_dim).
pos_embeddings.shape

torch.Size([4, 256])

: 