In [1]:
import torch

In [2]:
# Fix the RNG seed so the randomly initialised weights are the same on every run.
torch.manual_seed(123)

# Toy sizes: 6 possible token ids, each mapped to a 3-dimensional vector.
vocab_size = 6
output_dim = 3
embeddings = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=output_dim)

# The weight matrix has one row per token id: shape (vocab_size, output_dim).
print(embeddings.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [3]:
# Confirm the concrete class behind the layer object created above.
print(type(embeddings))

<class 'torch.nn.modules.sparse.Embedding'>


In [4]:
# An embedding layer is essentially a lookup table: passing token id 3 selects
# the row at index 3 of `embeddings.weight`.
embeddings(torch.tensor([3])) # returns the 4th row (index 3) of the weight matrix printed above

tensor([[-0.4015,  0.9666, -1.1481]], grad_fn=<EmbeddingBackward0>)

In [5]:
# Looking up several ids at once returns one weight row per id, in the order given
# (rows 2, 3, 5 and 1 of the weight matrix printed earlier).
input_ids = torch.tensor([2,3,5,1])
print(embeddings(input_ids))

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


In [6]:
# going over a more realistic example
# NOTE: `vocab_size` and `output_dim` are reassigned here, replacing the toy
# values (6 and 3) used for the small `embeddings` layer.
# 50257 presumably matches the vocabulary of the "gpt2" tiktoken encoding loaded
# further down -- TODO confirm.
vocab_size = 50257
output_dim = 256
token_embeddings_layer= torch.nn.Embedding(vocab_size, output_dim)

In [7]:
from torch.utils.data import Dataset, DataLoader
import tiktoken

In [8]:
class GPTDataset(Dataset):
    """Sliding-window dataset of (input, target) token-id pairs.

    The text is encoded once with ``tokeniser.encode``. A window of
    ``max_length`` token ids is then moved over the result in steps of
    ``stride``. For each window start ``s`` the input is
    ``tokens[s : s + max_length]`` and the target is the same window shifted
    one token to the right, ``tokens[s + 1 : s + max_length + 1]``.

    Args:
        raw_text: Text to encode.
        tokeniser: Object exposing ``encode(text)`` that returns a sequence of
            integer token ids.
        max_length: Number of token ids in every input/target window.
        stride: Distance between the starts of consecutive windows.
    """

    def __init__(self, raw_text, tokeniser, max_length, stride):
        token_ids = tokeniser.encode(raw_text)
        # Stop early enough that the shifted target window is always full length.
        window_starts = range(0, len(token_ids) - max_length, stride)
        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader(raw_text, tokeniser, max_length, stride, 
                      batch_size, shuffle=False, drop_last=True):
    """Build a ``DataLoader`` over sliding-window samples of ``raw_text``.

    Args:
        raw_text: Text passed to ``GPTDataset`` for encoding.
        tokeniser: Tokeniser passed to ``GPTDataset``.
        max_length: Window length passed to ``GPTDataset``.
        stride: Window step passed to ``GPTDataset``.
        batch_size: Forwarded to ``DataLoader``.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` wrapping a freshly built ``GPTDataset``.
    """
    return DataLoader(
        GPTDataset(raw_text, tokeniser, max_length, stride),
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
    )

In [9]:
# Load the whole training text into memory. The encoding is passed explicitly so
# decoding does not depend on the platform's default locale encoding.
# NOTE(review): assumes the file is UTF-8 encoded -- confirm.
with open("../resources/verdict.txt", encoding="utf-8") as f:
    raw_text = f.read()

# Number of characters (not tokens) in the text.
print(len(raw_text))

20559


In [10]:
# Tokenise with the "gpt2" tiktoken encoding and build a loader over the text.
# Using stride == max_length makes consecutive input windows non-overlapping.
tokeniser = tiktoken.get_encoding("gpt2")
max_length = 4
batch_size =8
dataloader = create_dataloader(raw_text, tokeniser,max_length, 
                               stride= max_length, batch_size=batch_size)

In [11]:
# Fetch and print the first batch: a pair of (inputs, targets) tensors. Each
# targets row is its inputs row shifted one token to the right.
# Note: next() consumes this batch from the iterator.
dataloader_iter= iter(dataloader)
print(next(dataloader_iter))

[tensor([[  464,  4643, 11600,    25],
        [ 1717,   342,   854, 41328],
        [   25, 40417,   198,  3109],
        [ 9213,   422, 11145,   271],
        [ 1668,   319,  3267,  2310],
        [   11, 48609,   198,   198],
        [   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271]]), tensor([[ 4643, 11600,    25,  1717],
        [  342,   854, 41328,    25],
        [40417,   198,  3109,  9213],
        [  422, 11145,   271,  1668],
        [  319,  3267,  2310,    11],
        [48609,   198,   198,    40],
        [  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899]])]


In [12]:
# Unpack the next batch. The iterator was already advanced once when the first
# batch was printed, so this is the second batch of the loader.
inputs, targets = next(dataloader_iter)

In [13]:
# Show both tensors; targets holds the same windows shifted by one token.
print("inputs: ", inputs, "\n")
print("targets: ", targets)

inputs:  tensor([[10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  922,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11],
        [  287,   262,  6001,   286],
        [  465, 13476,    11,   339]]) 

targets:  tensor([[ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   922],
        [ 5891,  1576,   438,   568],
        [  340,   373,   645,  1049],
        [ 5975,   284,   502,   284],
        [ 3285,   326,    11,   287],
        [  262,  6001,   286,   465],
        [13476,    11,   339,   550]])


In [14]:
# Shape is (batch_size, max_length): 8 sequences of 4 token ids each.
print("inputs shape: ", inputs.shape)

inputs shape:  torch.Size([8, 4])


In [15]:
# Position embeddings: one vector per position in the context window.
# Indexing the layer with arange(context_length) returns every row in order.
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
pos_embedding = pos_embedding_layer(torch.arange(context_length))

In [16]:
# Expect (context_length, output_dim).
print(pos_embedding.shape)

torch.Size([4, 256])


In [17]:
# Map every token id in the batch to its vector; an embedding dimension is
# appended, giving (batch_size, max_length, output_dim).
token_embeddings = token_embeddings_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [18]:
# Add position information to every sequence. pos_embedding is (4, 256) and
# broadcasts across the batch dimension of token_embeddings (8, 4, 256).
input_embeddings = token_embeddings + pos_embedding

In [19]:
# Broadcasting leaves the token-embedding shape unchanged.
print(input_embeddings.shape)

torch.Size([8, 4, 256])


In [20]:
# now just to test if I can add the axis and get same result
# unsqueeze(dim=0) adds an explicit leading axis of size 1 (the batch axis here).
pos_embedding_dim_correction = pos_embedding.unsqueeze(dim=0)

In [21]:
# Expect a leading axis of size 1 in front of the original (4, 256) shape.
print(pos_embedding_dim_correction.shape)

torch.Size([1, 4, 256])


In [22]:
# Same sum as before, with the batch axis made explicit. This rebinds
# `input_embeddings`, overwriting the value computed with implicit broadcasting.
input_embeddings = token_embeddings + pos_embedding_dim_correction

In [23]:
# Only the shape is checked here; the values are not compared with the earlier result.
print(input_embeddings.shape)

torch.Size([8, 4, 256])
