# Chapter 2. Working with Text Data

In [1]:
import tiktoken
import torch
import util
import os

from torch.utils.data import Dataset, DataLoader

## Encoder Demo

In [2]:
# Load the training text through the project-local helper.
# NOTE(review): util.text_corpus() is not visible here; presumably it returns
# the whole corpus as a single str (it is passed as `content: str` below) - confirm.
text = util.text_corpus()
# GPT-2 encoding from tiktoken; reused below for the encoding demo and to size
# the token-embedding table.
tokenizer = tiktoken.get_encoding('gpt2')

In [3]:
# Sample string mixing ordinary text, the '<|endoftext|>' marker and punctuation.
text_test = "Hello<|endoftext|> > !!!"

In [4]:
# '<|endoftext|>' is whitelisted via allowed_special so it is treated as a
# special token; in the printed ids below it shows up as the single id 50256.
encoded = tokenizer.encode(text_test, allowed_special={'<|endoftext|>'})
print(encoded)

[15496, 50256, 1875, 220, 10185]


## Create Dataset

In [5]:
class GPTDatasetV1(Dataset):
    """Sliding-window next-token dataset built from a single text.

    The text is tokenized once. Each sample pairs a window of
    ``context_window_size`` token ids with the same window shifted one
    position to the right, so ``target[k]`` is the token following
    ``input[k]``.
    """

    # Parallel lists: _input_ids[i] and _target_ids[i] form sample i.
    _input_ids: list[torch.Tensor]
    _target_ids: list[torch.Tensor]

    def __init__(
        self,
        content: str,
        # Quoted so the annotation is not evaluated when the class is defined.
        tokenizer: "tiktoken.core.Encoding",
        context_window_size: int,
        stride: int
    ):
        """Tokenize ``content`` and cut it into (input, target) windows.

        Args:
            content: Raw text to tokenize.
            tokenizer: Object whose ``encode(text, allowed_special=...)``
                returns a list of integer token ids.
            context_window_size: Number of tokens in every window (>= 1).
            stride: Distance in tokens between the starts of two consecutive
                windows (>= 1).

        Raises:
            ValueError: If ``context_window_size`` or ``stride`` is < 1.
        """
        if context_window_size < 1:
            raise ValueError(
                f"context_window_size must be >= 1, got {context_window_size}"
            )
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")

        self._input_ids = []
        self._target_ids = []

        # Allow the end-of-text marker as a special token so a corpus that
        # contains it does not make encode() raise.
        token_ids = tokenizer.encode(content, allowed_special={'<|endoftext|>'})

        # The last valid start is len - window - 1: the target window reaches
        # one token further than the input window.
        for start in range(0, len(token_ids) - context_window_size, stride):
            stop = start + context_window_size
            self._input_ids.append(torch.tensor(token_ids[start:stop]))
            self._target_ids.append(torch.tensor(token_ids[start + 1:stop + 1]))

    def __len__(self) -> int:
        """Return the number of (input, target) windows."""
        return len(self._input_ids)

    def __getitem__(self, idx: int) -> tuple[torch.Tensor, torch.Tensor]:
        """Return the ``(input_ids, target_ids)`` pair stored at ``idx``."""
        return self._input_ids[idx], self._target_ids[idx]

In [6]:
def create_dataloader_v1(
    content: str,
    batch_size: int,
    context_window: int,
    stride: int = 1,
    shuffle: bool = True,
    drop_last: bool = True,
    num_workers: int = 0,
) -> DataLoader:
    """Build a DataLoader of sliding-window (input, target) batches.

    Args:
        content: Raw text to tokenize with the GPT-2 encoding.
        batch_size: Number of windows per batch.
        context_window: Number of tokens in every window.
        stride: Distance in tokens between consecutive window starts.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` wrapping a ``GPTDatasetV1`` built from ``content``.
    """
    # Collect the pass-through DataLoader settings in one place.
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    windows = GPTDatasetV1(
        content,
        tiktoken.get_encoding('gpt2'),
        context_window,
        stride,
    )
    return DataLoader(windows, **loader_options)

In [7]:
# Window length in tokens; reused later as the size of the position-embedding table.
context_window_size = 256
# stride=128 is half the window, so consecutive windows overlap by 128 tokens.
# shuffle and drop_last keep their defaults (True).
data_loader = create_dataloader_v1(text, batch_size=8, context_window=context_window_size, stride=128)
# Explicit iterator so the following cells can pull batches one at a time.
data_iter = iter(data_loader)

In [8]:
# Pull one batch and display it: a pair of (inputs, targets) tensors.
# This advances data_iter, so the displayed batch is consumed.
next(data_iter)

[tensor([[ 1555,   262,  1306,  ...,  6846,    11,   314],
         [ 1276,   751,    11,  ...,   198,     1,  3666],
         [ 1422,   470,   438,  ...,   271, 10899,   550],
         ...,
         [ 4562,    11,  3181,  ...,   673,   550,   407],
         [   12, 12239,    11,  ...,   965,  1397,    11],
         [12917,   905,    11,  ...,   550,  1813,   510]]),
 tensor([[  262,  1306,  1110,  ...,    11,   314,  1276],
         [  751,    11,   339,  ...,     1,  3666, 13674],
         [  470,   438,    83,  ..., 10899,   550,   257],
         ...,
         [   11,  3181,   503,  ...,   550,   407, 17901],
         [12239,    11,   475,  ...,  1397,    11,   326],
         [  905,    11,  5025,  ...,  1813,   510,   465]])]

In [9]:
# Unpack the next batch into inputs (x) and targets (y) and check that both
# are (batch_size, context_window_size), i.e. 8 x 256 here.
x, y = next(data_iter)
print((x.shape, y.shape))

(torch.Size([8, 256]), torch.Size([8, 256]))


## Embedding Demo

In [10]:
# Tiny sizes so the whole embedding matrix can be printed and inspected.
demo_vocab_size: int = 6  # rows of the table, one per token id
demo_output_dim: int = 3  # length of each embedding vector

In [11]:
# Embedding table with demo_vocab_size rows and demo_output_dim columns.
# NOTE(review): the weights are randomly initialised and no seed is set in the
# visible cells, so the printed values will differ from run to run.
demo_embedding_layer = torch.nn.Embedding(demo_vocab_size, demo_output_dim)
print(demo_embedding_layer.weight)

Parameter containing:
tensor([[-0.5471, -0.2956,  0.1631],
        [-0.8480,  1.5287,  0.1057],
        [-0.1845, -0.1617,  1.1812],
        [-0.5664,  1.1810, -0.2273],
        [-1.4231, -0.3947,  1.6258],
        [-0.6573,  1.3287,  1.8565]], requires_grad=True)


In [12]:
# Calling the layer with token ids is a plain row lookup: ids 0, 2 and 3
# return rows 0, 2 and 3 of the weight matrix printed above.
demo_embedding_layer(torch.tensor([0, 2, 3]))

tensor([[-0.5471, -0.2956,  0.1631],
        [-0.1845, -0.1617,  1.1812],
        [-0.5664,  1.1810, -0.2273]], grad_fn=<EmbeddingBackward0>)

## Token Embedding

In [13]:
# One row more than the largest token id, so that id can still be looked up.
vocab_size = tokenizer.max_token_value + 1
# Width of each embedding vector; the position embeddings below use the same
# value so the two can be added element-wise.
output_dim = 512
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [14]:
# Embed the input batch already captured in `x` by the earlier
# `x, y = next(data_iter)` cell. Reusing `x` instead of pulling another batch
# from data_iter keeps this cell idempotent: re-running it neither skips data
# nor eventually raises StopIteration when the iterator runs out.
batch_x_embedding = token_embedding_layer(x)
# Expected shape: (batch_size, context_window_size, output_dim).
batch_x_embedding.shape

torch.Size([8, 256, 512])

## Position Embedding

In [15]:
# One learned vector per position index 0..context_window_size-1, with the
# same width (output_dim) as the token embeddings.
position_embedding_layer = torch.nn.Embedding(context_window_size, output_dim)

In [16]:
# Look up positions 0..context_window_size-1. unsqueeze(0) adds a leading
# axis of size 1, giving (1, context_window_size, output_dim) so the result
# can broadcast over the batch axis.
batch_position_embedding = position_embedding_layer(torch.arange(context_window_size).unsqueeze(0))
batch_position_embedding.shape

torch.Size([1, 256, 512])

## Combine Token and Position Embedding

In [17]:
# Element-wise sum; the size-1 leading axis of the position embeddings
# broadcasts across all sequences in the batch, so the shape stays
# (batch_size, context_window_size, output_dim).
input_embedding = batch_x_embedding + batch_position_embedding
input_embedding.shape

torch.Size([8, 256, 512])