# Embedding

Create a single `torch.nn.Embedding` layer and inspect its weight matrix.

In [16]:
import torch

# Seed the RNG before the layer is created so its random initial weights
# are the same on every run.
torch.manual_seed(123)

# Toy sizes: a lookup table with 6 rows, each row holding 3 values.
vocab_size, output_dim = 6, 3
embedding_layer = torch.nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=output_dim,
)

# The printed shape is that of the layer's weight matrix.
print("embedding_layer.shape: ", embedding_layer.weight.shape)
print("embedding_layer.weight: ", embedding_layer.weight)

embedding_layer.shape:  torch.Size([6, 3])
embedding_layer.weight:  Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [17]:
# Each ID selects the matching row of embedding_layer.weight, in the order given.
input_ids = torch.tensor([2, 3, 5, 1])
embedded_ids = embedding_layer(input_ids)  # run the lookup once, report it twice

print("embedding_layer(input_ids).shape: ", embedded_ids.shape)
print("embedding_layer(input_ids): ", embedded_ids)

embedding_layer(input_ids).shape:  torch.Size([4, 3])
embedding_layer(input_ids):  tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


## Position Encoding

Load the text and build the data loader, reusing the code from the previous section.

In [19]:
import os
from dotenv import load_dotenv

# Resolve the dataset directory from the DATASET_PATH environment variable.
# load_dotenv() runs first so that a value defined in a .env file is visible
# to os.getenv (presumably python-dotenv's default behaviour — confirm).
load_dotenv()
dataset_dir = os.getenv("DATASET_PATH")
if dataset_dir is None:
    # Fail with an actionable message instead of the opaque TypeError that
    # os.path.join(None, ...) would raise.
    raise RuntimeError(
        "DATASET_PATH is not set; define it in the environment or in a .env file"
    )
file_path = os.path.join(dataset_dir, "the-verdict.txt")

# Read the whole file into memory as a single UTF-8 string.
with open(file_path, "r", encoding="utf-8") as f:
    raw_text = f.read()


import tiktoken

# Load tiktoken's "gpt2" encoding and encode the whole of raw_text with it.
tokenizer = tiktoken.get_encoding("gpt2")
# NOTE(review): enc_text is not referenced again in the visible cells;
# confirm it is still needed.
enc_text = tokenizer.encode(raw_text)


from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    The text is tokenized once. Windows of ``max_length`` token IDs are then
    taken every ``stride`` tokens; each target window is its input window
    shifted one token to the right.

    Args:
        txt: Text to tokenize.
        tokenizer: Object whose ``encode(txt, allowed_special=...)`` returns
            a sliceable sequence of token IDs.
        max_length: Number of token IDs in every window.
        stride: Distance between the starts of consecutive windows.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
        assert len(ids) > max_length, "Number of tokenized inputs must at least be equal to max_length+1"

        # A window starting at s reads up to ids[s + max_length] for its
        # target, so the last admissible start is len(ids) - max_length - 1.
        starts = range(0, len(ids) - max_length, stride)
        self.input_ids = [torch.tensor(ids[s:s + max_length]) for s in starts]
        self.target_ids = [torch.tensor(ids[s + 1:s + max_length + 1]) for s in starts]

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a DataLoader over sliding-window samples of ``txt``.

    The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
    ``GPTDatasetV1``.

    Args:
        txt: Text to tokenize and chunk.
        batch_size: Forwarded to ``DataLoader``.
        max_length: Window length passed to ``GPTDatasetV1``.
        stride: Window step passed to ``GPTDatasetV1``.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    windows = GPTDatasetV1(
        txt, tiktoken.get_encoding("gpt2"), max_length, stride
    )
    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

max_length = 4

# stride == max_length, so consecutive windows start exactly one window
# length apart; shuffle=False keeps the batches in text order.
dataloader = create_dataloader_v1(
    txt=raw_text,
    batch_size=8,
    max_length=max_length,
    stride=max_length,
    shuffle=False,
)

# Pull only the first batch from the loader.
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

print("Inputs Token shape:\n", inputs.shape)
print("\nInput Token IDs:\n", inputs)

Inputs Token shape:
 torch.Size([8, 4])

Input Token IDs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])


Set up the token embedding layer.

In [25]:
# Rebinds vocab_size and output_dim, replacing the toy values used earlier.
# NOTE(review): 50257 is presumably the size of the "gpt2" vocabulary; confirm.
vocab_size, output_dim = 50257, 256
token_embedding_layer = torch.nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=output_dim,
)

# The printed shape is that of the layer's weight matrix.
print("token_embedding_layer.shape: ", token_embedding_layer.weight.shape)
print("token_embedding_layer.weight: ", token_embedding_layer.weight)

token_embedding_layer.shape:  torch.Size([50257, 256])
token_embedding_layer.weight:  Parameter containing:
tensor([[-0.9669,  0.6712, -0.9405,  ..., -0.5670, -0.2658, -1.1116],
        [ 0.4573, -2.0515, -1.6125,  ..., -0.7737,  0.0753,  0.5233],
        [-0.2805,  0.1652, -0.7465,  ...,  0.2864,  0.1072,  0.3312],
        ...,
        [-0.7341, -1.6871, -0.5628,  ..., -2.3785, -0.3345, -1.3539],
        [-0.5672, -1.8570, -0.7118,  ...,  0.2582,  0.2064, -0.3771],
        [-0.4862, -0.4990,  0.6122,  ...,  0.9440, -0.1298, -0.1704]],
       requires_grad=True)


Use the token embedding layer to embed the batch of input token IDs.

In [26]:
# Look up an embedding vector for every token ID in the batch; the result
# keeps the shape of `inputs` and gains a trailing axis of size output_dim.
token_embeddings = token_embedding_layer(inputs)

print("token_embeddings.shape: ", token_embeddings.shape)

token_embeddings.shape:  torch.Size([8, 4, 256])


Set up the positional embedding layer.

In [27]:
context_length = max_length

# One embedding row per position index 0 .. context_length - 1.
pos_embedding_layer = torch.nn.Embedding(
    num_embeddings=context_length,
    embedding_dim=output_dim,
)

# Look up the vectors for positions 0 .. max_length - 1.
position_ids = torch.arange(max_length)
pos_embeddings = pos_embedding_layer(position_ids)

print("pos_embeddings.shape: ", pos_embeddings.shape)
print("pos_embeddings: ", pos_embeddings)

pos_embeddings.shape:  torch.Size([4, 256])
pos_embeddings:  tensor([[-1.3132,  1.7849,  1.7399,  ...,  2.4390,  0.3164, -0.0337],
        [-0.5747, -1.0442, -1.6610,  ..., -0.4074, -0.7294, -0.0788],
        [ 0.5618, -0.4305,  0.1779,  ..., -0.7375, -0.2083,  0.3547],
        [-0.4516, -0.2100,  1.0609,  ..., -0.7244, -3.0794,  0.6927]],
       grad_fn=<EmbeddingBackward0>)


In [28]:
# Add the positional embeddings to the token embeddings. pos_embeddings has
# no batch axis, so broadcasting adds the same position vectors to every
# sequence in the batch.
input_embeddings = token_embeddings + pos_embeddings

print("input_embeddings.shape: ", input_embeddings.shape)

input_embeddings.shape:  torch.Size([8, 4, 256])
