## 0. Batch Creation with tokenised texts

### 0.1 Inputs for training

In [103]:
# Batch configuration
batch_size = 3  # Number of samples per batch
# Fixed token length of every input: longer texts are truncated, shorter ones are padded with 0s.
# It must *not* exceed the context length supported by the model.
in_seq_len = 7

# Sample texts, one per length case
txt1 = "Every effort moves you towards your goal"  # token_len == in_seq_len
txt2 = "Every day holds a"  # token_len < in_seq_len
txt3 = (
    "This statement is going to have token length "
    "greater than input sequence length"
)  # token_len > in_seq_len

### 0.2 Tokenize each text in batch & convert to tensor

In [104]:
import os

import tiktoken
import torch

In [105]:
# Load tiktoken's "gpt2" encoding; it is used below to encode each sample text into token ids
tokenizer = tiktoken.get_encoding("gpt2")

In [106]:
# Bring every sample to exactly in_seq_len token ids: truncate long texts, pad short ones


def encode_to_fixed_length(text, seq_len, encoder, pad_id=0):
    """Encode `text` and return a 1-D tensor of exactly `seq_len` token ids.

    Args:
        text: String to tokenize.
        seq_len: Required number of token ids in the result.
        encoder: Object whose `encode(text)` returns a sequence of int token ids.
        pad_id: Token id appended when the text has fewer than `seq_len` tokens.

    Returns:
        torch.Tensor of shape (seq_len,) and dtype torch.long.
    """
    token_ids = list(encoder.encode(text))[:seq_len]  # truncate to at most seq_len ids
    # Pad explicitly. Tensor.resize_ must not be used for padding: when it grows a
    # tensor the new elements are uninitialized memory, so zeros are not guaranteed.
    token_ids += [pad_id] * (seq_len - len(token_ids))
    return torch.tensor(token_ids, dtype=torch.long)


txt1_tokens = encode_to_fixed_length(txt1, in_seq_len, tokenizer)
txt2_tokens = encode_to_fixed_length(txt2, in_seq_len, tokenizer)
txt3_tokens = encode_to_fixed_length(txt3, in_seq_len, tokenizer)

In [107]:
# Show each fixed-length token tensor, then all three shapes on a single line
token_tensors = (txt1_tokens, txt2_tokens, txt3_tokens)
for token_tensor in token_tensors:
    print(token_tensor)
print(*(token_tensor.shape for token_tensor in token_tensors))

tensor([6109, 3626, 6100,  345, 3371,  534, 3061])
tensor([6109, 1110, 6622,  257,    0,    0,    0])
tensor([ 1212,  2643,   318,  1016,   284,   423, 11241])
torch.Size([7]) torch.Size([7]) torch.Size([7])


In [108]:
### 0.3 Create batch
# Stack the three fixed-length token tensors along a new leading (batch) dimension
batch = torch.stack((txt1_tokens, txt2_tokens, txt3_tokens), dim=0)

print(batch)
batch.shape  # batch_size * in_seq_len

tensor([[ 6109,  3626,  6100,   345,  3371,   534,  3061],
        [ 6109,  1110,  6622,   257,     0,     0,     0],
        [ 1212,  2643,   318,  1016,   284,   423, 11241]])


torch.Size([3, 7])

## 1. Embedding

In [109]:
# The embedding tables created below are randomly initialised. Seed torch's RNG so a
# fresh "Restart & Run All" reproduces the same embeddings (and the same saved tensor).
rng_seed = 123
torch.manual_seed(rng_seed)

vocab_size = 50257  # Size of gpt2 tokenizer used
emb_dim = 10  # Dimension of Embedding to be created; Actual value: 768
context_length = 9  # Context Length supported by the model; Actual value: 1024

In [110]:
import torch.nn as nn

### 1.1 Token Embedding

In [111]:
# Lookup table holding one emb_dim-sized vector per vocabulary entry
tok_emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_dim)
# Replace every token id in the batch with its vector from the table
tok_embeds = tok_emb(batch)

In [123]:
# Print the token embeddings of the first sample in the batch only
print(tok_embeds[0,:,:])
tok_embeds.shape  # batch_size * in_seq_len * emb_dim

tensor([[ 8.0560e-01, -7.2369e-02,  1.6215e+00,  1.2957e+00, -6.7076e-01,
         -4.4406e-01,  9.3164e-02, -9.3621e-01,  2.4426e-01,  2.2015e+00],
        [ 2.5781e-01, -3.3988e+00, -5.5075e-01, -4.7603e-01, -1.5854e+00,
         -1.4190e+00, -5.6897e-01,  6.1440e-01, -1.5085e-03, -3.8581e-01],
        [-9.4619e-01,  1.1671e+00,  4.9519e-01, -2.1238e-01,  4.0831e-01,
          7.5337e-01,  6.7045e-01,  7.4886e-02, -1.1705e-01, -3.5606e-01],
        [ 6.4367e-01, -2.0547e+00,  1.9315e-01, -1.7959e+00,  3.7737e-01,
          3.1932e-01,  1.0136e-01, -3.1545e-01, -4.8681e-01,  1.1560e+00],
        [ 7.5275e-01,  6.9840e-01,  3.4175e-01, -2.8001e+00,  1.3978e+00,
          2.2762e+00,  4.9106e-02,  4.5074e-01, -1.5248e+00,  1.2301e+00],
        [-3.1708e-01, -3.5105e-01,  2.4424e-01,  1.1923e-01,  2.0328e-01,
         -2.2486e-01, -1.0199e+00,  3.2237e-02, -7.0953e-01, -8.7331e-01],
        [-7.7676e-01,  4.4808e-01,  1.8099e-01, -8.8066e-01, -3.3011e-01,
          7.4125e-01, -6.2246e-0

torch.Size([3, 7, 10])

### 1.2 Positional Embedding

In [113]:
# Fail early with a clear message: pos_emb only has rows for positions
# 0 .. context_length - 1, so a longer input sequence cannot be embedded.
if in_seq_len > context_length:
    raise ValueError(
        f"in_seq_len ({in_seq_len}) must not exceed context_length ({context_length})"
    )

# Creating tensor of positional ids: 0, 1, ..., in_seq_len - 1
pos_ids = torch.arange(in_seq_len, device=batch.device)

pos_emb = nn.Embedding(context_length, emb_dim)  # Size of context length (max supported), Size of output vector
pos_embeds = pos_emb(pos_ids)  # One vector per position id

In [114]:
# One row per position id; there is no batch dimension here
print(pos_embeds)
pos_embeds.shape  # in_seq_len * emb_dim

tensor([[ 1.5175e+00, -3.0224e-01,  1.0007e+00,  9.1725e-01, -6.7031e-01,
          5.5924e-01, -7.2874e-01, -3.9477e-01, -6.8363e-01,  2.1758e-01],
        [ 2.9607e-01, -1.7877e+00,  5.0068e-02,  1.5456e+00,  6.8654e-01,
         -1.3739e-02,  1.4753e-01,  1.3004e+00,  1.2431e+00,  1.1466e+00],
        [-1.2794e+00, -1.4152e+00, -4.3518e-02, -1.3565e+00,  4.2320e-02,
         -9.3943e-01,  8.3320e-01, -1.0907e+00, -1.8053e-01, -1.7541e+00],
        [-6.6483e-01,  1.6275e+00,  1.9352e+00,  8.0503e-01, -6.0275e-01,
         -3.7215e-01,  4.6318e-01,  8.2472e-01,  7.5301e-01,  9.8796e-01],
        [-3.5740e-01, -7.4040e-01, -4.6665e-01,  3.1377e-05, -5.1613e-01,
         -4.8468e-03,  1.7932e+00, -1.1144e+00, -5.2987e-01, -2.6246e-02],
        [-1.3561e+00,  1.2757e-01, -7.7255e-01,  1.6031e+00, -1.2485e+00,
          1.1571e+00, -4.3108e-01,  5.4125e-01, -6.6783e-02,  1.2242e+00],
        [-5.0875e-02,  1.0655e+00,  2.2317e-01,  1.3689e+00, -1.9937e+00,
         -1.3690e+00,  1.7098e+0

torch.Size([7, 10])

### 1.3 Final Embedding
Final Embedding = Token Embedding + Positional Embedding

In [115]:
# pos_embeds (in_seq_len * emb_dim) is broadcast across the batch dimension of
# tok_embeds (batch_size * in_seq_len * emb_dim), so every sample gets the same positional vectors
final_embedding = tok_embeds + pos_embeds

In [124]:
# Print the combined (token + positional) embeddings of the first sample in the batch only
print(final_embedding[0,:,:])
final_embedding.shape  # batch_size * in_seq_len * emb_dim

tensor([[ 2.3231, -0.3746,  2.6221,  2.2130, -1.3411,  0.1152, -0.6356, -1.3310,
         -0.4394,  2.4190],
        [ 0.5539, -5.1864, -0.5007,  1.0695, -0.8989, -1.4327, -0.4214,  1.9148,
          1.2416,  0.7608],
        [-2.2256, -0.2482,  0.4517, -1.5689,  0.4506, -0.1861,  1.5037, -1.0158,
         -0.2976, -2.1102],
        [-0.0212, -0.4272,  2.1284, -0.9909, -0.2254, -0.0528,  0.5645,  0.5093,
          0.2662,  2.1440],
        [ 0.3954, -0.0420, -0.1249, -2.8000,  0.8817,  2.2713,  1.8423, -0.6637,
         -2.0547,  1.2039],
        [-1.6732, -0.2235, -0.5283,  1.7224, -1.0452,  0.9322, -1.4510,  0.5735,
         -0.7763,  0.3509],
        [-0.8276,  1.5135,  0.4042,  0.4883, -2.3238, -0.6278,  1.0873,  2.3286,
         -0.1649,  0.0505]], grad_fn=<SliceBackward0>)


torch.Size([3, 7, 10])

## 2. Dropout

In [119]:
# Dropout probability, kept at zero for now; a small non-zero value can be tried instead
drop_rate = 0.0
drop_emb = nn.Dropout(p=drop_rate)

In [120]:
# Recompute from tok_embeds + pos_embeds instead of feeding final_embedding back into
# itself: re-running this cell then never applies dropout on top of an already
# dropped-out tensor (which would compound once drop_rate > 0).
final_embedding = drop_emb(tok_embeds + pos_embeds)

In [121]:
final_embedding.shape  # batch_size * in_seq_len * emb_dim (same shape as before dropout)

torch.Size([3, 7, 10])

## 3. Save to carry forward

In [122]:
# torch.save does not create missing folders, so make sure the target directory exists first
save_path = "intermediate_values/final_embedding.pt"
os.makedirs(os.path.dirname(save_path), exist_ok=True)
torch.save(final_embedding, save_path)