# Creating token embeddings with PyTorch

### Import the required packages

In [1]:
import numpy as np
import torch
import tiktoken
from torch.utils.data import Dataset, DataLoader

In [2]:
# read the whole corpus into memory as a single string; the encoding is given
# explicitly so decoding does not depend on the platform's default locale
# (assumes data.txt is UTF-8 encoded -- adjust if the file uses another encoding)
with open("data.txt", "r", encoding="utf-8") as file:
    raw_text = file.read()

In [3]:
# peek at the first 10 characters as a quick sanity check of the loaded text
raw_text[:10]

'Alice was '

In [4]:
class DreamLLMDataset(Dataset):
    """Sliding-window dataset of (input, target) token-id tensor pairs.

    The text is tokenized once and then cut into windows of
    ``context_length`` tokens. Every target window is its input window
    shifted one token to the right.

    Args:
        raw_text: text handed to ``tokenizer.encode``.
        tokenizer: object whose ``encode(raw_text)`` returns a sliceable
            sequence of token ids.
        context_length: number of tokens in each window.
        stride: offset between the starts of consecutive windows.
    """

    def __init__(self, raw_text, tokenizer, context_length=10, stride=1):
        token_ids = tokenizer.encode(raw_text)

        # a window starting at `start` also needs the token at
        # start + context_length (the last target token), so the starts
        # stop before len(token_ids) - context_length
        window_starts = range(0, len(token_ids) - context_length, stride)

        # input windows
        self.input_ids = [
            torch.tensor(token_ids[start:start + context_length])
            for start in window_starts
        ]
        # target windows: the same spans shifted right by one token
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + context_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of windows held by the dataset."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(input_ids, target_ids)`` pair stored at ``index``."""
        return self.input_ids[index], self.target_ids[index]

In [5]:
# create the tokenizer: load tiktoken's "gpt2" encoding
tokenizer = tiktoken.get_encoding("gpt2")

In [6]:
# build the sliding-window dataset from the raw text
# (context_length and stride are left at the class defaults)
dataset = DreamLLMDataset(raw_text=raw_text, tokenizer=tokenizer)

# wrap the dataset in a loader that yields batches of 5 samples
data_loader = DataLoader(dataset, batch_size=5)

In [7]:
# get an iterator over the batches produced by the data loader
data_loader_iterator = iter(data_loader)

# get the first batch: a pair of tensors (inputs, targets), each holding
# one row per sample in the batch
inputs, targets = next(data_loader_iterator)

In [8]:
# display the first batch; each target row is the matching input row
# shifted by one token
inputs, targets

(tensor([[44484,   373,  3726,   284,   651,   845, 10032,   286,  5586,   416],
         [  373,  3726,   284,   651,   845, 10032,   286,  5586,   416,   607],
         [ 3726,   284,   651,   845, 10032,   286,  5586,   416,   607,  6621],
         [  284,   651,   845, 10032,   286,  5586,   416,   607,  6621,   198],
         [  651,   845, 10032,   286,  5586,   416,   607,  6621,   198,   261]]),
 tensor([[  373,  3726,   284,   651,   845, 10032,   286,  5586,   416,   607],
         [ 3726,   284,   651,   845, 10032,   286,  5586,   416,   607,  6621],
         [  284,   651,   845, 10032,   286,  5586,   416,   607,  6621,   198],
         [  651,   845, 10032,   286,  5586,   416,   607,  6621,   198,   261],
         [  845, 10032,   286,  5586,   416,   607,  6621,   198,   261,   262]]))

## Create an embedding layer using PyTorch

In [22]:
# get the vocab_size: the size of the tokenizer's vocabulary, used below as
# the number of rows in the embedding table
vocab_size = tokenizer.n_vocab
vocab_size

50257

In [23]:
# declare the embedding dimension: the length of the vector stored for each
# token (a small value, presumably chosen to keep the printed tensors readable)
dimensions = 5

In [24]:
# create the embedding layer: a trainable lookup table with vocab_size rows,
# each row holding `dimensions` values
embedding_layer = torch.nn.Embedding(vocab_size, dimensions)
# show the weight matrix; it is randomly initialised, so the values differ
# between runs unless a seed is set
embedding_layer.weight

Parameter containing:
tensor([[-1.6524,  0.5827,  0.2203,  0.0866,  0.4616],
        [-0.3565, -0.8197, -0.0359, -0.5595,  0.9661],
        [ 1.0778,  2.2309,  1.0436, -0.1034, -1.4463],
        ...,
        [-0.0164,  0.0498, -1.4630, -0.2073,  0.2761],
        [-1.1461, -0.1843,  0.9202,  0.0232, -0.4972],
        [ 0.4729, -1.7133,  0.0961, -0.5926,  1.3687]], requires_grad=True)

In [25]:
# get the embeddings of input: every token id in `inputs` is replaced by its
# row of the embedding weight matrix, adding a trailing axis of size `dimensions`
input_embeddings = embedding_layer(inputs)
input_embeddings

tensor([[[ 1.8229, -0.3904,  0.1670, -1.6235,  1.2489],
         [ 1.1301, -0.5418,  1.0714,  1.6421, -0.1737],
         [-1.1841,  0.1286,  1.0626,  0.9894,  0.0925],
         [-0.4297,  0.1926, -0.1690,  0.6104,  0.0106],
         [-2.1916,  1.2459, -0.9955, -0.7046,  0.2778],
         [-2.6952, -0.4334,  0.0495, -0.0684,  0.7257],
         [-0.1990, -1.0723,  0.9373, -0.1661,  0.2087],
         [ 0.4891,  0.6249, -0.9196,  1.1644, -0.5797],
         [ 0.5255,  0.6536,  2.3863, -1.9749,  0.4202],
         [ 0.8682, -1.4933, -0.3441, -0.0957, -0.5528]],

        [[ 1.1301, -0.5418,  1.0714,  1.6421, -0.1737],
         [-1.1841,  0.1286,  1.0626,  0.9894,  0.0925],
         [-0.4297,  0.1926, -0.1690,  0.6104,  0.0106],
         [-2.1916,  1.2459, -0.9955, -0.7046,  0.2778],
         [-2.6952, -0.4334,  0.0495, -0.0684,  0.7257],
         [-0.1990, -1.0723,  0.9373, -0.1661,  0.2087],
         [ 0.4891,  0.6249, -0.9196,  1.1644, -0.5797],
         [ 0.5255,  0.6536,  2.3863, -1.9749, 

In [26]:
# expected shape: (batch_size, context_length, dimensions) = (5, 10, 5)
input_embeddings.shape

torch.Size([5, 10, 5])