## 0. Batch Creation with tokenised texts

### 0.1 Inputs for training

In [1]:
# Batch configuration.
batch_size = 3
# Every input is brought to exactly this many tokens: longer texts are
# truncated from the left, shorter ones are padded on the left with eot tokens.
# This length *cannot* be greater than the context length supported by the model.
in_seq_len = 7

# Sample texts, one per length case relative to in_seq_len.
txt1 = "Every effort moves you towards your goal"  # token_len == in_seq_len
txt2 = "Every day holds a"  # token_len < in_seq_len
txt3 = "This statement is going to have token length greater than input sequence length"  # token_len > in_seq_len

### 0.2 Tokenize each text in batch & convert to tensor

In [2]:
import os

import tiktoken
import torch

In [3]:
# "gpt2" encoding from tiktoken; its eot_token id is used below as the left-padding value.
tokenizer = tiktoken.get_encoding("gpt2")

In [4]:
def adjust_tensor_length(tensor, in_seq_len, eot_token):
    """Return `tensor` with exactly `in_seq_len` elements along dim 0.

    Longer inputs keep only their last `in_seq_len` elements (truncation from
    the left); shorter inputs are padded on the left with `eot_token`.

    Args:
        tensor: Tensor whose first dimension is the sequence length.
        in_seq_len: Target length; must be >= 0.
        eot_token: Value used to fill the left padding.

    Returns:
        A tensor of length `in_seq_len` along dim 0. When the input already
        has that length, the very same tensor object is returned.

    Raises:
        ValueError: If `in_seq_len` is negative.
    """
    if in_seq_len < 0:
        raise ValueError(f"in_seq_len must be non-negative, got {in_seq_len}")

    cur_len = tensor.size(0)
    if cur_len > in_seq_len:
        # Slice from an explicit start index: `tensor[-in_seq_len:]` would
        # return the whole tensor when in_seq_len == 0, because -0 == 0.
        return tensor[cur_len - in_seq_len:]
    if cur_len < in_seq_len:
        # Build the padding with the input's dtype/device so cat() needs no conversion.
        padding = torch.full((in_seq_len - cur_len,), eot_token, dtype=tensor.dtype, device=tensor.device)
        return torch.cat((padding, tensor), dim=0)
    return tensor

In [5]:
# Encode every sample and bring it to exactly in_seq_len tokens
# (truncated from the left, or padded on the left with the tokenizer's eot token).
txt1_tokens, txt2_tokens, txt3_tokens = (
    adjust_tensor_length(torch.tensor(tokenizer.encode(sample)), in_seq_len, tokenizer.eot_token)
    for sample in (txt1, txt2, txt3)
)

In [6]:
# Print each fixed-length token tensor, then all three shapes on a single line.
for tokens in (txt1_tokens, txt2_tokens, txt3_tokens):
    print(tokens)
print(*(t.shape for t in (txt1_tokens, txt2_tokens, txt3_tokens)))

tensor([6109, 3626, 6100,  345, 3371,  534, 3061])
tensor([50256, 50256, 50256,  6109,  1110,  6622,   257])
tensor([11241,  4129,  3744,   621,  5128,  8379,  4129])
torch.Size([7]) torch.Size([7]) torch.Size([7])


In [7]:
### 0.3 Create batch
# Stack the three equal-length token tensors along a new leading (batch) axis.
batch = torch.stack([txt1_tokens, txt2_tokens, txt3_tokens], dim=0)

print(batch)
batch.shape  # batch_size * in_seq_len

tensor([[ 6109,  3626,  6100,   345,  3371,   534,  3061],
        [50256, 50256, 50256,  6109,  1110,  6622,   257],
        [11241,  4129,  3744,   621,  5128,  8379,  4129]])


torch.Size([3, 7])

## 1. Embedding

In [8]:
# Toy hyperparameters for this walkthrough ("actual value" = the value noted for the real model).
vocab_size = 50257      # size of the gpt2 tokenizer used
emb_dim = 10            # dimension of the embedding to be created (actual value: 768)
context_length = 9      # context length supported by the model (actual value: 1024)

In [9]:
import torch.nn as nn

### 1.1 Token Embedding

In [10]:
# Lookup table holding one emb_dim-sized vector per vocabulary id.
tok_emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_dim)
# Map every token id of every sequence in the batch to its vector.
tok_embeds = tok_emb(batch)

In [11]:
# Show the token embeddings of the first sequence in the batch only.
print(tok_embeds[0,:,:])
tok_embeds.shape  # batch_size * in_seq_len * emb_dim

tensor([[-0.6974,  0.4461, -0.9584, -0.5125,  0.1477,  0.1206, -1.6799, -1.2684,
          0.7520,  0.0756],
        [ 0.3967,  1.1326, -1.3573,  0.6575, -0.9811, -0.4606,  0.4231, -0.5453,
         -2.4681,  0.2739],
        [-0.7516, -0.2530,  1.7825, -0.0568,  0.0073,  0.5140, -0.5162, -0.5955,
         -0.8983, -0.0322],
        [-2.0613,  0.7856, -1.1759,  0.4869,  1.4556, -0.5829, -1.5793, -0.4895,
          1.1080,  0.5295],
        [-0.4142, -0.5999,  1.5383,  0.6595, -1.3727, -1.4834, -0.0532,  0.4266,
         -0.4942,  0.1028],
        [ 0.1817,  0.5926,  1.5677,  0.9269,  0.6115, -1.0457,  0.4409,  1.3449,
          1.3554, -0.3465],
        [-0.8319,  1.3842,  1.7604, -2.3966, -1.1095,  0.4116, -0.2831,  0.5327,
          2.0340, -1.0048]], grad_fn=<SliceBackward0>)


torch.Size([3, 7, 10])

### 1.2 Positional Embedding

In [12]:
# One position id per slot of the input sequence (0 .. in_seq_len - 1), on the batch's device.
pos_ids = torch.arange(0, in_seq_len, device=batch.device)

# Table with one emb_dim-sized vector per position up to the maximum supported context length.
pos_emb = nn.Embedding(num_embeddings=context_length, embedding_dim=emb_dim)
pos_embeds = pos_emb(pos_ids)

In [13]:
# One row per position id.
print(pos_embeds)
pos_embeds.shape  # in_seq_len * emb_dim

tensor([[-0.8694, -0.5829, -1.2370,  0.6793, -1.3547, -0.4929,  0.8497, -1.6269,
          1.0749, -0.3987],
        [ 0.9512, -0.6089,  0.5841, -1.0966, -0.2790, -0.4522,  1.3523,  1.7756,
         -1.6127,  0.6727],
        [ 2.2905, -1.5452,  0.3904,  0.5729,  1.4988,  0.4038, -0.2181, -0.3068,
          0.3925,  1.5758],
        [ 1.8101, -1.3761, -0.7446, -1.8951,  0.8433,  1.1220,  0.6325, -1.1945,
         -0.7663,  1.2327],
        [-0.2103,  0.7874,  1.1674,  0.1641,  0.8570, -0.8286, -0.3119,  1.5290,
          0.7902, -0.4786],
        [ 0.2054,  2.3139,  0.8673, -0.9339, -1.6844, -1.1535,  2.1952, -0.6071,
          1.2367, -0.3352],
        [ 1.9264, -2.1599,  0.1947,  0.5211,  0.4694,  0.4119,  1.8605,  1.0501,
         -2.6292, -0.5645]], grad_fn=<EmbeddingBackward0>)


torch.Size([7, 10])

### 1.3 Final Embedding
Final Embedding = Token Embedding + Positional Embedding

In [14]:
# pos_embeds has no batch axis, so the addition broadcasts it across every sequence in the batch.
final_embedding = tok_embeds + pos_embeds

In [15]:
# Show the combined embedding of the first sequence in the batch only.
print(final_embedding[0,:,:])
final_embedding.shape  # batch_size * in_seq_len * emb_dim

tensor([[-1.5668, -0.1368, -2.1955,  0.1668, -1.2070, -0.3723, -0.8302, -2.8953,
          1.8269, -0.3231],
        [ 1.3479,  0.5237, -0.7732, -0.4390, -1.2601, -0.9128,  1.7754,  1.2303,
         -4.0808,  0.9466],
        [ 1.5389, -1.7982,  2.1728,  0.5161,  1.5061,  0.9178, -0.7344, -0.9022,
         -0.5057,  1.5436],
        [-0.2512, -0.5905, -1.9205, -1.4082,  2.2990,  0.5391, -0.9468, -1.6840,
          0.3417,  1.7622],
        [-0.6245,  0.1875,  2.7057,  0.8237, -0.5156, -2.3120, -0.3651,  1.9557,
          0.2960, -0.3758],
        [ 0.3872,  2.9064,  2.4349, -0.0070, -1.0729, -2.1992,  2.6361,  0.7379,
          2.5921, -0.6817],
        [ 1.0945, -0.7756,  1.9551, -1.8756, -0.6402,  0.8235,  1.5774,  1.5828,
         -0.5951, -1.5694]], grad_fn=<SliceBackward0>)


torch.Size([3, 7, 10])

## 2. Dropout

In [16]:
# Dropout probability is kept at zero here; a small non-zero probability can be tried instead.
drop_rate = 0.0
drop_emb = nn.Dropout(p=drop_rate)

In [17]:
# NOTE: rebinds final_embedding to the dropout output, so re-running this cell
# applies dropout again to the already-processed tensor.
final_embedding = drop_emb(final_embedding)

In [18]:
final_embedding.shape  # batch_size * in_seq_len * emb_dim

torch.Size([3, 7, 10])

## 3. Save to carry forward

In [19]:
# torch.save does not create missing parent directories, so make sure the
# output folder exists before writing (keeps Restart & Run All working on a
# fresh checkout).
save_path = "intermediate_values/final_embedding.pt"
os.makedirs(os.path.dirname(save_path), exist_ok=True)
torch.save(final_embedding, save_path)