#   Tokenization

## STEP 1 : Creating Tokens
##### Here we split the text file into individual words

In [52]:
# Read the complete book into a single string. The encoding is passed
# explicitly so the result does not depend on the platform default.
with open("gita.txt", mode="r", encoding="utf-8") as book_file:
    raw_text = book_file.read()

# Quick sanity check: overall size plus a preview of the first characters.
total_chars = len(raw_text)
print("Total number of character:", total_chars)
print(raw_text[:99])

Total number of character: 137483

The Project Gutenberg EBook of The Bhagavad-Gita, by Anonymous

This eBook is for the use of anyon


##### Using Python's built-in "re" (regular expression) module


In [53]:
import re

# Splitting on a *captured* whitespace group keeps every separator in the
# output, so each space shows up as a token of its own.
text = "Hello, world. This, is a test."
whitespace_splitter = re.compile(r'(\s)')
result = whitespace_splitter.split(text)

print(result)

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test.']


In [54]:
# Split on commas and periods as well as whitespace, so that punctuation
# becomes a token of its own. Whitespace and empty strings are still present
# in this result; the filtering happens where the full text is tokenized.
# Whether whitespace should be dropped depends on the requirements: for
# Python code, for example, it is important to keep it, whereas dropping it
# reduces the memory requirement.
punctuation_or_space = re.compile(r'([,.]|\s)')
result = punctuation_or_space.split(text)

print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ',', '', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


In [55]:
# Besides commas and full stops we want every other kind of punctuation in
# the book (and the double dash) to become a token, so the split pattern is
# widened. Whitespace-only and empty pieces are dropped afterwards.
pieces = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
preprocessed = []
for piece in pieces:
    stripped = piece.strip()
    if stripped:
        preprocessed.append(stripped)

print(preprocessed[:30])
print(len(preprocessed))

['The', 'Project', 'Gutenberg', 'EBook', 'of', 'The', 'Bhagavad-Gita', ',', 'by', 'Anonymous', 'This', 'eBook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with', 'almost', 'no', 'restrictions', 'whatsoever', '.', 'You']
29369


## Convert the tokens into token IDs

In [56]:
# Build the vocabulary: the distinct tokens, sorted alphabetically.
all_words = list(set(preprocessed))
all_words.sort()
vocab_size = len(all_words)

print(vocab_size)

5181


In [57]:
# Assign each token its position in the sorted list as its integer ID.
vocab = dict(zip(all_words, range(len(all_words))))


In [58]:
# With the tokens sorted and numbered, peek at the first 51 (token, ID) pairs.
for item in list(vocab.items())[:51]:
    print(item)


('!', 0)
('"', 1)
('#2388]', 2)
('$1', 3)
('$5', 4)
('&', 5)
('&c', 6)
("'", 7)
('(', 8)
(')', 9)
('***', 10)
('*****', 11)
(',', 12)
('-', 13)
('--', 14)
('-all', 15)
('-grandsires', 16)
('-nay', 17)
('.', 18)
('//www', 19)
('000', 20)
('1', 21)
('1500', 22)
('1900', 23)
('2', 24)
('20%', 25)
('2000', 26)
('2001', 27)
('2013', 28)
('23', 29)
('2388-h', 30)
('26', 31)
('3', 32)
('30', 33)
('4', 34)
('4557', 35)
('5', 36)
('50', 37)
('501', 38)
('596-1887', 39)
('6', 40)
('60', 41)
('64-6221541', 42)
('67', 43)
('7', 44)
('8', 45)
('801', 46)
('809', 47)
('84116', 48)
('9', 49)
('90', 50)


# STEP 2 : Now let's create a class for the encoder and decoder
##### Basically, encoding is what we have done so far, i.e. creating token IDs from a text file; in decoding we do the reverse, i.e. recreating the text
##### from the token IDs

In [59]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed vocabulary.

    Text is split on whitespace, the double dash and a set of punctuation
    characters; every resulting token is looked up in the vocabulary.

    Args:
        vocab: Mapping from token string to integer ID.
    """

    def __init__(self, vocab):
        self.str_to_int = vocab
        # Reverse lookup used by decode().
        self.int_to_str = {i:s for s,i in vocab.items()}
    
    def encode(self, text):
        """Convert text into a list of token IDs.

        Raises:
            KeyError: If a token of ``text`` is not in the vocabulary.
        """
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        # Drop whitespace-only and empty pieces produced by the split.
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
        
    def decode(self, ids):
        """Convert a list of token IDs back into a single string.

        Raises:
            KeyError: If an ID is not in the vocabulary.
        """
        text = " ".join([self.int_to_str[i] for i in ids])
        # Remove the space that the join puts in front of punctuation.
        # ':' and ';' are included because encode() splits them off as
        # separate tokens too; without them a round trip leaves " :" / " ;".
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

In [60]:
# Build a tokenizer from the vocabulary and encode a short passage.
tokenizer = SimpleTokenizerV1(vocab)

# Note: the literal opens with three quotes followed by a fourth one, so the
# text itself begins with a double-quote character.
text = """"Then, at the signal of the aged king,
With blare to wake the blood, rolling around"""
ids = tokenizer.encode(text)
print(ids)

[1, 1403, 12, 1906, 4678, 4372, 3673, 4678, 1793, 3278, 12, 1618, 2017, 4739, 4972, 4678, 2037, 12, 4168, 1880]


In [61]:
# Map the IDs back to text. decode() only removes the space *before*
# punctuation, so a space remains after the opening quote in the result.
tokenizer.decode(ids)

'" Then, at the signal of the aged king, With blare to wake the blood, rolling around'

# STEP 3 : SPECIAL CONTEXT TOKENS
##### We have now created a class for tokenizing and de-tokenizing text, but what if there is a word whose token is not present in the vocabulary? Let's deal with that.

In [62]:
# Prepare the vocabulary for a tokenizer that can handle unknown words:
# append two special entries after the sorted tokens, then rebuild the
# token -> ID mapping.
all_tokens = sorted(set(preprocessed)) + ["<|endoftext|>", "<|unk|>"]

vocab = dict(zip(all_tokens, range(len(all_tokens))))

In [63]:
len(vocab) # the size increased by the 2 special tokens

5183

In [64]:
# Quick check of the last 5 entries: the two special tokens come last.
for item in list(vocab.items())[-5:]:
    print(item)

('your', 5178)
('youth', 5179)
('zip', 5180)
('<|endoftext|>', 5181)
('<|unk|>', 5182)


In [65]:
# Now let's add the <|unk|> handling to the class

class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to "<|unk|>".

    Args:
        vocab: Mapping from token string to integer ID. It has to contain
            "<|unk|>" for unknown tokens to be encodable.
    """

    def __init__(self, vocab):
        self.str_to_int = vocab
        # Inverse mapping (ID -> token) for decoding.
        self.int_to_str = dict((token_id, token) for token, token_id in vocab.items())

    def encode(self, text):
        """Split ``text`` into tokens and return their IDs.

        Tokens missing from the vocabulary are encoded as "<|unk|>".
        """
        ids = []
        for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
            token = piece.strip()
            if not token:
                # Whitespace-only or empty piece produced by the split.
                continue
            if token not in self.str_to_int:
                token = "<|unk|>"
            ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Join the tokens for ``ids`` with spaces and tidy up punctuation."""
        tokens = [self.int_to_str[token_id] for token_id in ids]
        joined = " ".join(tokens)
        # Drop the space in front of the listed punctuation characters.
        return re.sub(r'\s+([,.:;?!"()\'])', r'\1', joined)

In [66]:
# Finally, build the tokenizer that can handle unknown words and look at the
# vocabulary it works with.
tokenizer = SimpleTokenizerV2(vocab)

# The loop index gets its own name: calling it `tokenizer` would rebind the
# tokenizer object created above to an int.
for i, item in enumerate(vocab.items()):
    print(item)
    if i >= 2000:
        break


('!', 0)
('"', 1)
('#2388]', 2)
('$1', 3)
('$5', 4)
('&', 5)
('&c', 6)
("'", 7)
('(', 8)
(')', 9)
('***', 10)
('*****', 11)
(',', 12)
('-', 13)
('--', 14)
('-all', 15)
('-grandsires', 16)
('-nay', 17)
('.', 18)
('//www', 19)
('000', 20)
('1', 21)
('1500', 22)
('1900', 23)
('2', 24)
('20%', 25)
('2000', 26)
('2001', 27)
('2013', 28)
('23', 29)
('2388-h', 30)
('26', 31)
('3', 32)
('30', 33)
('4', 34)
('4557', 35)
('5', 36)
('50', 37)
('501', 38)
('596-1887', 39)
('6', 40)
('60', 41)
('64-6221541', 42)
('67', 43)
('7', 44)
('8', 45)
('801', 46)
('809', 47)
('84116', 48)
('9', 49)
('90', 50)
('99712', 51)
(':', 52)
(';', 53)
('?', 54)
('A', 55)
('ACTUAL', 56)
('ADHIBHUTA', 57)
('ADHIDAIVA', 58)
('ADHIYAJNA', 59)
('ADHYATMAN', 60)
('AGREE', 61)
('AGREEMENT', 62)
('AK', 63)
('AKSHARAM', 64)
('AND', 65)
('ANY', 66)
('ANYTHING', 67)
('ARJUNA', 68)
('ARNOLD', 69)
('AS-IS', 70)
('ASAT', 71)
('ASCII', 72)
('ATTAINING', 73)
('Abandoning', 74)
('Abode', 75)
('About', 76)
('Above', 77)
('Abstaining'

# Byte Pair Encoding (BPE)

In [67]:
!pip install tiktoken
# Its a fast BPE tokenizer which is used in OpenAI models 



In [68]:
import importlib 
import tiktoken

In [69]:
# Now we can instantiate the BPE tokenizer from tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
# It is used the same way as the SimpleTokenizerV1/V2 classes implemented above:
# encode() turns text into token IDs and decode() turns token IDs back into text.

In [70]:
# Encode

# NOTE(review): the two adjacent string literals are concatenated without a
# separating space, so the text reads "...dioxideof someunknownPlace".
text = (
    "Hello, do you like Valorant? <|endoftext|> I come from a place where you breadth carbon dioxide"
    "of someunknownPlace"
)
# <|endoftext|> is a special token that was used in GPT training to separate one data source from another:
# if two different books are loaded for training, <|endoftext|> is placed between them.
# allowed_special lets the literal text "<|endoftext|>" be encoded as that special token.
# The text also contains a made-up word, someunknownPlace; BPE handles it automatically
# without raising an out-of-vocabulary error.
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"}) 
print(integers)

[15496, 11, 466, 345, 588, 3254, 273, 415, 30, 220, 50256, 314, 1282, 422, 257, 1295, 810, 345, 32483, 6588, 288, 12190, 1651, 69, 617, 34680, 27271]


In [71]:
# Decode

# Turn the token IDs back into text; the printed result reproduces the input.
strings = tokenizer.decode(integers)
print(strings)

Hello, do you like Valorant? <|endoftext|> I come from a place where you breadth carbon dioxideof someunknownPlace


In [72]:
# We can see two new things here:
# 1. <|endoftext|> is assigned a large token ID.
# 2. BPE encodes and decodes unknown words like someunknownPlace without any error. This is because
#    it breaks such words into smaller known pieces (down to single bytes if needed) instead of requiring whole words or using only individual characters.

# Input-Target data pairs using DataLoader

In [73]:
# In this section we will implement a data loader that fetches input-target pairs using a sliding window approach.
# We will first tokenize our book.

In [74]:
# Load the book again and tokenize it with the BPE tokenizer.
with open("gita.txt", mode="r", encoding="utf-8") as book_file:
    raw_text = book_file.read()

enc_text = tokenizer.encode(raw_text)
# Number of BPE tokens in the whole book.
print(len(enc_text))

38552


In [75]:
# Skip the first 50 token IDs and work with the remainder of the book.
enc_sample = enc_text[50:]


In [76]:
# For the next-word prediction task, the easiest way to create an input-target pair is to use two variables, x and y, where x contains the input tokens
# and y contains the targets, which are the inputs shifted by one position.

# The CONTEXT SIZE determines how many tokens are included in the input

context_size = 25 # length of the input: x holds 25 tokens and y holds the same window shifted by one
                # For example, with a context size of 4, x would contain the tokens [1,2,3,4] and
                # the target y would be the next 4 tokens [2,3,4,5]
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]

print(f"x: {x}")
print(f"y:      {y}") 

x: [198, 260, 12, 1904, 340, 739, 262, 2846, 286, 262, 4935, 20336, 13789, 3017, 198, 4480, 428, 46566, 393, 2691, 379, 7324, 13, 70, 19028]
y:      [260, 12, 1904, 340, 739, 262, 2846, 286, 262, 4935, 20336, 13789, 3017, 198, 4480, 428, 46566, 393, 2691, 379, 7324, 13, 70, 19028, 13]


In [77]:
# Taking the inputs together with the targets (the inputs shifted by one)
# gives a series of next-word prediction tasks: every prefix of the sample is
# a context, and the token right after it is what has to be predicted.
for end in range(1, context_size + 1):
    context, desired = enc_sample[:end], enc_sample[end]

    # Left of the arrow: the input the LLM would receive.
    # Right of the arrow: the token ID the LLM is supposed to predict.
    print(context, "------->", desired)

[198] -------> 260
[198, 260] -------> 12
[198, 260, 12] -------> 1904
[198, 260, 12, 1904] -------> 340
[198, 260, 12, 1904, 340] -------> 739
[198, 260, 12, 1904, 340, 739] -------> 262
[198, 260, 12, 1904, 340, 739, 262] -------> 2846
[198, 260, 12, 1904, 340, 739, 262, 2846] -------> 286
[198, 260, 12, 1904, 340, 739, 262, 2846, 286] -------> 262
[198, 260, 12, 1904, 340, 739, 262, 2846, 286, 262] -------> 4935
[198, 260, 12, 1904, 340, 739, 262, 2846, 286, 262, 4935] -------> 20336
[198, 260, 12, 1904, 340, 739, 262, 2846, 286, 262, 4935, 20336] -------> 13789
[198, 260, 12, 1904, 340, 739, 262, 2846, 286, 262, 4935, 20336, 13789] -------> 3017
[198, 260, 12, 1904, 340, 739, 262, 2846, 286, 262, 4935, 20336, 13789, 3017] -------> 198
[198, 260, 12, 1904, 340, 739, 262, 2846, 286, 262, 4935, 20336, 13789, 3017, 198] -------> 4480
[198, 260, 12, 1904, 340, 739, 262, 2846, 286, 262, 4935, 20336, 13789, 3017, 198, 4480] -------> 428
[198, 260, 12, 1904, 340, 739, 262, 2846, 286, 262, 

In [78]:
# For better understanding, repeat the same loop but show the decoded text
# instead of the token IDs.

for end in range(1, context_size + 1):
    context, desired = enc_sample[:end], enc_sample[end]

    # decode() expects a list of IDs, hence the single target is wrapped.
    print(tokenizer.decode(context), "------>", tokenizer.decode([desired]))



 ------> re

re ------> -

re- ------> use

re-use ------>  it

re-use it ------>  under

re-use it under ------>  the

re-use it under the ------>  terms

re-use it under the terms ------>  of

re-use it under the terms of ------>  the

re-use it under the terms of the ------>  Project

re-use it under the terms of the Project ------>  Gutenberg

re-use it under the terms of the Project Gutenberg ------>  License

re-use it under the terms of the Project Gutenberg License ------>  included

re-use it under the terms of the Project Gutenberg License included ------> 


re-use it under the terms of the Project Gutenberg License included
 ------> with

re-use it under the terms of the Project Gutenberg License included
with ------>  this

re-use it under the terms of the Project Gutenberg License included
with this ------>  eBook

re-use it under the terms of the Project Gutenberg License included
with this eBook ------>  or

re-use it under the terms of the Project Gutenberg License in

In [79]:
# We've now created the input-target pairs that we can use for LLM training in upcoming chapters.
# There's only one more task before we can turn the tokens into embeddings: implementing an efficient data loader that iterates over the input dataset and returns the inputs and targets as PyTorch tensors,
# which can be thought of as multidimensional arrays.

# IMPLEMENTING A DATA LOADER
##### In particular, we are interested in returning two tensors: an input tensor containing the text that the LLM sees and a target tensor that includes the targets for the LLM to predict,



##### Step 1: Tokenize the entire text

##### Step 2: Use a sliding window to chunk the book into overlapping sequences of max_length

##### Step 3: Return the total number of rows in the dataset

##### Step 4: Return a single row from the dataset

In [80]:
# A DataLoader in PyTorch is a utility that helps manage and load data efficiently during training or inference.

# Loads data in batches: Divides your dataset into smaller chunks for processing.
# Shuffles data: Randomizes data to prevent patterns during training.
# Handles multiprocessing: Uses multiple worker processes to load data faster.
# Why do you need a DataLoader?
# Efficiency: Training models on the entire dataset at once is memory-intensive. DataLoader loads data in manageable chunks (batches).
# Performance: It supports parallel data loading, reducing the time spent waiting for data during training.
# Convenience: Handles dataset shuffling, batching, and even custom transformations seamlessly.
# In short, DataLoader makes training deep learning models efficient and scalable by automating data preparation tasks.

In [81]:
import torch
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    Args:
        txt: Text to tokenize.
        tokenizer: Object whose ``encode(txt, allowed_special=...)`` returns
            a list of token IDs.
        max_length: Number of tokens in every input and target chunk.
        stride: Distance in tokens between the starts of consecutive chunks.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        # Tokenize the entire text once.
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Start offsets of the windows; stopping at len - max_length leaves
        # room for the target, which reaches one token further than the input.
        window_starts = range(0, len(token_ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        # Each target is its input window shifted right by one token.
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of (input, target) rows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair stored at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]
    



# GPTDatasetV1 is a custom dataset class based on PyTorch's Dataset.
# It prepares input and target data for training a language model.

# Each "row" in the dataset contains:
# 1. input_chunk: A fixed-length sequence of token IDs (numbers representing the text).
# 2. target_chunk: The sequence of token IDs the model should predict based on input_chunk.

# How it works:
# - The entire text is tokenized into a long sequence of token IDs.
# - This long sequence is split into smaller overlapping chunks using a sliding window.
# - Each chunk is stored as input and target pairs for the model to learn from.

# Why use this:
# - The model trains on these smaller chunks, learning to predict the next token(s).
# - Overlapping chunks give the model more context for better predictions.



##### Now let's use this GPTDatasetV1 class to load the inputs in BATCHES via a PyTorch DataLoader

In [82]:
# Steps #
# 1 - initialize the tokenizer

# 2 - create dataset

# 3 - drop_last = True drop the last batch if it is shorter than the specified batch_size
# to prevent loss spikes during training

# 4 - num_workers: the number of CPU processes to use for preprocessing

In [83]:
def create_dataloader_v1(txt, batch_size=4, max_length=64, stride=32, 
                         shuffle=True, drop_last=True, num_workers=0):
    """Build a DataLoader over sliding-window chunks of ``txt``.

    Args:
        txt: Text to tokenize with the "gpt2" tiktoken encoding.
        batch_size: Number of rows per batch.
        max_length: Number of tokens per input/target chunk.
        stride: Step in tokens between consecutive chunks.
        shuffle: Passed through to ``DataLoader``.
        drop_last: Passed through to ``DataLoader``.
        num_workers: Passed through to ``DataLoader``.

    Returns:
        A ``DataLoader`` wrapping a ``GPTDatasetV1`` built from ``txt``.
    """
    # Tokenizer and dataset.
    gpt2_tokenizer = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, gpt2_tokenizer, max_length, stride)

    # Batching options are forwarded to the loader unchanged.
    loader_options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )
    return DataLoader(windows, **loader_options)

In [84]:
# Let's test the dataloader with batch_size = 1 for an LLM with a context size of 4.
# The book is read again first.
with open("gita.txt", mode="r", encoding="utf-8") as book_file:
    raw_text = book_file.read()

In [85]:
# Convert the dataloader into a Python iterator to fetch the next entry via Python's
# built-in next() function

import torch
print("PyTorch version:", torch.__version__)
# stride=1 moves the window by one token per row; shuffle=False keeps the rows in text order.
dataloader = create_dataloader_v1(
    raw_text, batch_size=1, max_length=4, stride=1, shuffle=False
)

data_iter = iter(dataloader)
# The first batch holds one input tensor and one target tensor.
first_batch = next(data_iter)
print(first_batch)

PyTorch version: 2.4.1+cu121
[tensor([[  198,   464,  4935, 20336]]), tensor([[  464,  4935, 20336,   412]])]


# TOKEN EMBEDDINGS
##### meaning of embedding - a mathematical representation of a word, phrase, or piece of text as a high-dimensional vector, capturing its semantic meaning and allowing the model to understand the relationships between different words and concepts within a text

In [86]:
# Firstly we will install GENSIM, it is widely used for creating and using vector representations of text, including word embeddings and document embeddings.
!pip install gensim




In [87]:
import gensim.downloader as api
model = api.load("glove-twitter-25")  # download the "glove-twitter-25" model and return it as an object ready for use

### Example of a word as a vector

In [88]:
# Alias for the loaded model; the following cells index it by word.
word_vectors=model

# Let us look at what the vector embedding of a word looks like
print(word_vectors['computer'])  # Example: accessing the vector for the word 'computer'

[ 0.64005  -0.019514  0.70148  -0.66123   1.1723   -0.58859   0.25917
 -0.81541   1.1708    1.1413   -0.15405  -0.11369  -3.8414   -0.87233
  0.47489   1.1541    0.97678   1.1107   -0.14572  -0.52013  -0.52234
 -0.92349   0.34651   0.061939 -0.57375 ]


In [89]:
# Checking the dimensionality of a word vector
print(word_vectors['cat'].shape) # (25,) means each vector has 25 dimensions; word2vec-google-news-300 is another model with 300 dimensions

(25,)


In [90]:
# Example of using most_similar
# Classic analogy test: which words are closest to king + woman - man?
print(word_vectors.most_similar(positive=['king', 'woman'], negative=['man'], topn=10)) # the expected answer is 'queen'; with this small 25-dimensional model a few other words rank higher,
#                                                                                          and 'queen' shows up in third place

[('meets', 0.8841924071311951), ('prince', 0.832163393497467), ('queen', 0.8257461190223694), ('’s', 0.8174097537994385), ('crow', 0.813499391078949), ('hunter', 0.8131037950515747), ('father', 0.8115834593772888), ('soldier', 0.81113600730896), ('mercy', 0.8082392811775208), ('hero', 0.8082264065742493)]


In [91]:
# Example of calculating similarity for a handful of word pairs
word_pairs = [
    ('woman', 'man'),
    ('king', 'queen'),
    ('uncle', 'aunt'),
    ('boy', 'girl'),
    ('nephew', 'niece'),
    ('paper', 'water'),  # lowest score of the six: the words are unrelated
]
for first_word, second_word in word_pairs:
    print(word_vectors.similarity(first_word, second_word))

0.76541775
0.9202422
0.9338088
0.95961404
0.96716267
0.70250624


In [92]:
# Most similar words
print(word_vectors.most_similar("tower", topn=5)) # prints the 5 words whose vectors are most similar to that of "tower", with their scores

[('square', 0.9199698567390442), ('gate', 0.9008872509002686), ('bridge', 0.8968756198883057), ('pacific', 0.8831596970558167), ('oak', 0.8686302304267883)]


In [93]:
# Now let us see the vector similarity
import numpy as np

# Word pairs to compare
word_pairs = [
    ('man', 'woman'),
    ('semiconductor', 'earthworm'),
    ('nephew', 'niece'),
]

# For each pair: subtract the two word vectors and take the norm
# (magnitude) of the difference.
magnitudes = [
    np.linalg.norm(model[first_word] - model[second_word])
    for first_word, second_word in word_pairs
]

# Print the magnitude of the difference
for (first_word, second_word), magnitude in zip(word_pairs, magnitudes):
    print("The magnitude of the difference between '{}' and '{}' is {:.2f}".format(first_word, second_word, magnitude))

# The model is small, so the results are not very meaningful here: ideally the
# magnitude for semiconductor/earthworm would be the largest, and the first
# and third pairs would be the smallest.

The magnitude of the difference between 'man' and 'woman' is 3.77
The magnitude of the difference between 'semiconductor' and 'earthworm' is 3.70
The magnitude of the difference between 'nephew' and 'niece' is 1.26


# Positional Embeddings (Encoding word positions)

In [94]:
# Let's consider a more realistic and useful embedding size and encode the input tokens into a 256-dimensional vector representation.
# This is smaller than GPT-3 (12,288 dimensions) and GPT-2 (768 dimensions), but still reasonable for experiments.
# We will also be using the BPE tokenizer that we used earlier, which has a vocab_size (total number of token IDs) of 50257.

vocab_size = 50257 # no. of token ids
output_dim = 256 # this is the vector dimensions

# Fixed: the second argument used to be spelled `ouput_dim`, a name that is
# never defined, so this line raised a NameError on a fresh kernel.
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
# The torch.nn.Embedding layer is used to map integer indices (token IDs) to dense vector representations.
# Creates an embedding lookup table of size (vocab_size, output_dim).
# Each row corresponds to the embedding vector of a token ID.


##### Lets add data loader (ie. data sampling with a sliding window)

In [95]:
max_length = 4 # number of tokens in each input sample
# stride equals max_length here, so consecutive windows start max_length tokens apart.
dataloader = create_dataloader_v1(
    raw_text, batch_size=8, max_length=max_length,
    stride=max_length, shuffle=False
)

data_iter = iter(dataloader) # Converts the dataloader object into an iterator, so batches can be retrieved one at a time.
inputs, targets = next(data_iter) # Fetches the first batch and unpacks it into inputs (token IDs) and targets (the same windows shifted by one token).

In [96]:
# Show the token IDs of the first batch and the shape of the input tensor.
print("Token ID:\n", inputs)
print("\nInput Shape\n", inputs.shape)

Token ID:
 tensor([[  198,   464,  4935, 20336],
        [  412, 10482,   286,   383],
        [16581,   363,   615,   324],
        [   12,    38,  5350,    11],
        [  416, 19200,   198,   198],
        [ 1212, 46566,   318,   329],
        [  262,   779,   286,  2687],
        [ 6609,   379,   645,  1575]])

Input Shape
 torch.Size([8, 4])


In [97]:
# As we can see, the token ID tensor is 8x4-dimensional, meaning that the data batch consists of 8 text samples with 4 tokens each.
# Now let's use the embedding layer to embed these token IDs into 256-dimensional vectors:

token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [98]:
# As you can see, each token ID is now embedded as a 256-dimensional vector.
# That completes the token embedding; now let's move forward with positional encoding.
# For a GPT model's absolute embedding approach, we just need to create another embedding layer that has the same dimension as the token_embedding_layer:
context_length = max_length # 4
positional_encoding_layer = torch.nn.Embedding(context_length, output_dim)

In [100]:
# Look up one embedding vector for each position 0 .. max_length - 1.
position_embeddings = positional_encoding_layer(torch.arange(max_length))
print(position_embeddings.shape)

torch.Size([4, 256])


In [101]:
# As shown in the preceding code example, the input to the position_embeddings is usually a placeholder vector torch.arange(context_length), 
# which contains a sequence of numbers 0, 1, ..., up to the maximum input length − 1.
# The context_length is a variable that represents the supported input size of the LLM.
# Here, we choose it similar to the maximum length of the input text.
# In practice, input text can be longer than the supported context length, in which case we have to truncate the text.


# As we can see, the positional embedding tensor consists of four 256-dimensional vectors. We can now add these directly to the token embeddings,
#  where PyTorch will add the 4x256-dimensional position_embeddings tensor to the 4x256-dimensional token embedding tensor of each of the 8 samples in the batch:


##### Add the positional embeddings to the token embeddings

In [102]:
# token_embeddings is (8, 4, 256) and position_embeddings is (4, 256); the
# addition broadcasts the position vectors over the 8 samples of the batch.
input_embeddings = token_embeddings + position_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])
