# Embedding logic using a lookup table - the lookup table works like a dictionary (token id -> embedding vector)

In [1]:
import numpy as np
import torch

In [2]:
# Geometry of the embedding table:
#   vocab_size -> number of rows (one per token id)
#   dimension  -> number of columns (length of each embedding vector)
vocab_size, dimension = 10, 5

In [3]:
# Lookup table holding every embedding: shape (vocab_size, dimension),
# i.e. row i is the vector for token id i. Values are drawn from a standard
# normal distribution and requires_grad=True marks the table as trainable.
lookup_table = torch.randn((vocab_size, dimension), requires_grad=True)
lookup_table

tensor([[-0.4841, -1.3054, -0.2945, -1.7204,  0.4161],
        [-1.1229, -0.8891,  0.8219,  1.1185,  0.0902],
        [ 0.8518, -0.6951,  0.1415,  0.4564, -1.3059],
        [ 0.8701,  1.3547,  0.9424,  0.0304,  0.1960],
        [-2.2807,  1.0329, -1.2156,  0.5004,  0.2888],
        [ 1.5730, -0.1236, -0.7685, -1.4298, -0.2352],
        [-0.4445,  1.3177, -0.2923, -0.6271, -0.0583],
        [-0.7289,  0.2090,  0.6173, -0.4027, -0.4868],
        [-1.4385,  1.1790,  0.1834, -0.0434,  0.1437],
        [ 1.4119, -0.0596, -0.1195, -1.2390, -0.2819]], requires_grad=True)

In [4]:
# Sanity check: the table should be (vocab_size, dimension)
lookup_table.size()

torch.Size([10, 5])

In [5]:
# Ids of the tokens to embed; each id is used as a row index into the lookup table
token_ids = torch.tensor((1, 6, 8, 9))

In [6]:
# Integer-tensor indexing: each id in token_ids picks the matching row of lookup_table,
# giving one embedding vector per token (4 ids -> a 4 x 5 result here).
embeddings = lookup_table[token_ids]
embeddings

tensor([[-1.1229, -0.8891,  0.8219,  1.1185,  0.0902],
        [-0.4445,  1.3177, -0.2923, -0.6271, -0.0583],
        [-1.4385,  1.1790,  0.1834, -0.0434,  0.1437],
        [ 1.4119, -0.0596, -0.1195, -1.2390, -0.2819]],
       grad_fn=<IndexBackward0>)

## Use tiktoken to tokenize text and generate embeddings

In [7]:
import tiktoken

In [9]:
# Load the GPT-2 byte-pair encoding and show how many tokens its vocabulary has
tokenizer = tiktoken.get_encoding(encoding_name="gpt2")
tokenizer.n_vocab

50257

In [10]:
# Create a trainable lookup table with one row per token in the tokenizer's vocabulary.
# The embedding width reuses `dimension` (defined with the toy example earlier in the
# notebook) instead of repeating the literal 5, so it is configured in one place.
# NOTE: this rebinds `lookup_table`, replacing the small toy table created earlier.
lookup_table = torch.randn(tokenizer.n_vocab, dimension, requires_grad=True)
lookup_table

tensor([[ 0.7262, -1.8836,  0.6666, -3.8572, -0.3350],
        [ 1.2309,  0.2757,  0.5897, -0.3518,  0.7307],
        [ 0.1197, -0.0409,  0.6423,  0.2313, -0.3541],
        ...,
        [-1.0223, -0.3180,  2.0840, -0.0123, -1.4385],
        [-0.6999,  0.7726, -1.1392,  0.1516,  0.5691],
        [-1.2993, -1.7773, -0.3010,  1.1979,  0.6932]], requires_grad=True)

In [11]:
# Sanity check: one row per vocabulary entry, one column per embedding dimension
lookup_table.size()

torch.Size([50257, 5])

In [13]:
# Text to embed
sentence = "I love machine learning"

# Tokenize the sentence into integer token ids.
# Note: token_ids is rebound here to the value returned by encode (shown below as a
# plain Python list), whereas the earlier toy example used a tensor.
token_ids = tokenizer.encode(text=sentence)
token_ids

[40, 1842, 4572, 4673]

In [14]:
# Look up one embedding row per token id; token_ids is a plain Python list here,
# and indexing the tensor with it selects those rows (one row per token).
embeddings = lookup_table[token_ids]
embeddings

tensor([[ 1.3944, -0.1768,  1.2553,  1.0207, -0.3683],
        [ 0.0256,  0.7404, -0.0541,  1.5718, -1.6003],
        [ 0.1360, -0.1951,  0.1533, -0.1770, -0.3233],
        [ 1.4045,  0.1204, -2.2204,  1.3765,  2.7580]],
       grad_fn=<IndexBackward0>)