In [47]:
import torch
import os, sys
import tiktoken

# Resolve the directory two levels above the notebook's working directory and
# expose its `stage1` sub-directory on the import path.
project_root = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, os.pardir)
)

stage1_root = os.path.join(project_root, "stage1")
# Re-running this cell must not stack duplicate entries on sys.path, so drop
# any existing occurrence before placing stage1 at the front.
while stage1_root in sys.path:
    sys.path.remove(stage1_root)
sys.path.insert(0, stage1_root)

# now 'src' is a top-level package
from src.gpt2small import GPTModel, GPTConfig124, generate_text


In [48]:
# Hyperparameters handed to GPTModel, one keyword per line for readability.
cfg_pt = GPTConfig124(
    vocab_size=50257,
    context_length=256,
    emb_dim=768,
    n_heads=12,
    n_layers=12,
    dropout=0.1,
    qkv_bias=False,
)
# Seed before building the model so the random initial weights are repeatable.
torch.manual_seed(123)
model = GPTModel(cfg_pt)
# Put the model in eval mode; as the cell's last expression this also
# displays the module tree.
model.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_q): Linear(in_features=768, out_features=768, bias=False)
        (W_k): Linear(in_features=768, out_features=768, bias=False)
        (W_v): Linear(in_features=768, out_features=768, bias=False)
        (dropout): Dropout(p=0.1, inplace=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_q): Linear(in_features=768, out_feat

In [49]:
def text_to_token_ids(text, tokenizer, allowed_special=None):
    """
    Encode `text` with `tokenizer` and return the IDs as a batch of size 1.

    Args:
        text: string to tokenize.
        tokenizer: object exposing `encode(text, allowed_special=...)`
            (e.g. a tiktoken encoding).
        allowed_special: special tokens that may appear in `text`. Accepts an
            iterable of token strings, a single token string, or the literal
            'all'. A falsy value (None, empty set, ...) defaults to
            {'<|endoftext|>'}.

    Returns:
        Integer (torch.long) tensor of shape [1, num_tokens].

    tensor.unsqueeze(dim) inserts a new axis (of size 1) at index dim.
    x = torch.tensor([10, 20, 30])       shape: [3]
    x0 = x.unsqueeze(0)                 shape: [1,3]
    x1 = x.unsqueeze(1)                 shape: [3,1]
    """
    if not allowed_special:
        # A real one-element set; set('<|endoftext|>') would instead split the
        # token into single characters and allow nothing useful.
        allowed = {'<|endoftext|>'}
    elif isinstance(allowed_special, str):
        # 'all' is passed through untouched; any other string is treated as
        # one token rather than being exploded into characters.
        allowed = allowed_special if allowed_special == 'all' else {allowed_special}
    else:
        allowed = set(allowed_special)
    token_list = tokenizer.encode(text, allowed_special=allowed)
    # Explicit dtype keeps the result integer-typed even for an empty list.
    ids = torch.tensor(token_list, dtype=torch.long).unsqueeze(0)
    # unsqueeze turns a 1D sequence of token IDs into a 2D batch of size 1.
    # Almost all PyTorch nn.Modules (embeddings, transformers, etc.)
    # expect inputs of shape (batch_size, seq_len, ...), so even if we only
    # have one example, we need to present it as a batch of size 1.
    return ids

def token_ids_to_text(token_ids, tokenizer):
    """
    Decode a tensor of token IDs back into a string.

    Args:
        token_ids: tensor of token IDs, either batched with shape
            [1, num_tokens], unbatched with shape [num_tokens], or a single
            0-d ID.
        tokenizer: object exposing `decode(list_of_ids)`.

    Returns:
        Whatever `tokenizer.decode` returns for the flat list of IDs.

    tensor.squeeze(dim: optional) removes the axis at index dim if its size is 1.
    y = torch.zeros(1, 5, 1)         shape: [1,5,1]
    y0 = y.squeeze(0)               shape: [5,1]
    y1 = y.squeeze(2)               shape: [1,5]
    y2 = y.squeeze()               shape: [5] (all dims 1 are removed)
    """
    if token_ids.dim() == 0:
        # A bare scalar ID still has to reach decode() as a one-element list.
        flat = token_ids.reshape(1)
    elif token_ids.dim() > 1:
        # squeeze(0) just undoes the batch dimension we previously added,
        # giving back the raw token sequence.
        flat = token_ids.squeeze(0)
    else:
        # Already a 1-D sequence. Squeezing here would collapse a length-1
        # sequence into a scalar, and tolist() would then yield an int.
        flat = token_ids
    return tokenizer.decode(flat.tolist())

st_context = "A man told me"
tokenizer = tiktoken.get_encoding('gpt2')

# Encode the prompt once, then let the model extend it by ten tokens.
prompt_ids = text_to_token_ids(st_context, tokenizer, allowed_special=None)
token_ids = generate_text(
    model=model,
    idx=prompt_ids,
    max_new_tokens=10,
    context_size=cfg_pt.context_length,
)

# Decode the generated IDs (prompt included) back to text for inspection.
print('Output text: ', token_ids_to_text(token_ids, tokenizer))

Output text:  A man told me accumulation thumbnail Flask 406 propensity Hat lush Tulsolk se


### Text Generation Loss

As can be seen from the output above, our model is producing random, incoherent text. This is because it has not yet undergone training (and because the input context is very short). Training is done in order to increase the softmax probability assigned to the vocabulary indices that correspond to the correct target tokens. An untrained model will simply return the argmax of a rather arbitrary softmax distribution (produced by randomly initialized weights) across the vocabulary, for each token position. The goal of training is then to maximize the chance of selecting the correct token by increasing its probability relative to the other tokens.

In [50]:
# Sample passage used as toy data for the next-token prediction examples.
text = """A man told me once that all the bad people
Were needed. Maybe not all, but your fingernails
You need; they are really claws, and we know
Claws. The sharks--what about them?
They make other fish swim faster. The hard-faced men
In black coats who chase you for hours
In dreams--that's the only way to get you
To the shore. Sometimes those hard women
Who abandon you get you to say, "You."
A lazy part of us is like a tumbleweed.
It doesn't move on its own. It takes sometimes
A lot of Depression to get tumbleweeds moving.
Then they blow across three or four States.
This man told me that things work together.
Bad handwriting sometimes leads to new ideas;
And a careless God--who refuses to let you
Eat from the Tree of Knowledge--can lead
To books, and eventually to us. We write
Poems with lies in them, but they help a little."""

# Encode the whole passage; the helper returns the IDs with a leading batch
# axis of size 1, so `tokens` is 2-D.
tokens = text_to_token_ids(text, tokenizer, allowed_special=None)
print(f'Shape of tokens: {tokens.shape}')
print(f'Tokens:\n {tokens}')

B, T = 2, 4  # (batch_size, seq_len)
# We need B*T input tokens plus one extra so the targets can be the same
# sequence shifted left by one position.
n_needed = B * T + 1
assert tokens.shape[1] >= n_needed, "text is too short for the requested (B, T)"
data = tokens[0][:n_needed]

x = data[:-1].view(B, T)  # input tensor
y = data[1:].view(B, T)   # target tensor for next token prediction

print(f'Inputs:\n {x}')
print(f'Targets:\n {y}')

with torch.no_grad():  # we are not training yet, just an example
    logits = model(x)
# Normalise the last axis so each position holds a probability distribution.
probas = torch.softmax(logits, dim=-1)
print(probas.shape)


Shape of tokens: torch.Size([1, 212])
Tokens:
 tensor([[   32,   582,  1297,   502,  1752,   326,   477,   262,  2089,   661,
           198, 35653,  2622,    13,  6674,   407,   477,    11,   475,   534,
          8038,  1142,  1768,   198,  1639,   761,    26,   484,   389,  1107,
         28421,    11,   290,   356,   760,   198,  2601,  8356,    13,   383,
         27476,   438, 10919,   546,   606,    30,   198,  2990,   787,   584,
          5916,  9422,  5443,    13,   383,  1327,    12, 24903,  1450,   198,
           818,  2042, 30720,   508, 15505,   345,   329,  2250,   198,   818,
         10625,   438,  5562,   338,   262,   691,   835,   284,   651,   345,
           198,  2514,   262, 15191,    13,  8975,   883,  1327,  1466,   198,
          8241,  6871,   345,   651,   345,   284,   910,    11,   366,  1639,
           526,   198,    32, 16931,   636,   286,   514,   318,   588,   257,
         47978, 39054,    13,   198,  1026,  1595,   470,  1445,   319,   663,
     

Note that the shape of the probability tensor is [2, 4, 50257], the same as the shape of our model's logits output: (batch_size, seq_len, vocab_size).

In [51]:
# Greedy choice: index of the most probable entry along the last axis, with
# that axis kept as size 1.
token_ids = probas.argmax(dim=-1, keepdim=True)
print(token_ids)

tensor([[[12434],
         [22182],
         [22418],
         [24106]],

        [[ 9099],
         [30130],
         [40937],
         [34965]]])


The output above contains two sets of predictions, one for each sequence in the input batch. Each element in a set is the token ID predicted to follow the input token at that position. Perhaps this is better explained visually.

In [52]:
# Drop the trailing size-1 axis so predictions line up with the inputs.
preds = token_ids.squeeze(-1)
batch_size, seq_len = x.shape

# Walk inputs and predictions row by row, pairing each input token with the
# token predicted at the same position.
for inp_row, pred_row in zip(x.tolist(), preds.tolist()):
    for inp_id, pred_id in zip(inp_row, pred_row):
        inp_tok = tokenizer.decode([inp_id])
        pred_tok = tokenizer.decode([pred_id])
        print(f"'{inp_tok}' [{inp_id}] ---> '{pred_tok}' [{pred_id}]")

'A' [32] ---> ' Driver' [12434]
' man' [582] ---> 'NP' [22182]
' told' [1297] ---> ' Munich' [22418]
' me' [502] ---> ' accumulation' [24106]
' once' [1752] ---> 'don' [9099]
' that' [326] ---> ' eagerly' [30130]
' all' [477] ---> ' dogma' [40937]
' the' [262] ---> ' ali' [34965]


In [53]:
# Decode targets and predictions for each sequence so they can be compared
# side by side; the labels are emitted exactly as written.
for label, ids in (
    ('Targets batch 1:\n', y[0]),
    ('Outputs batch 1:\n', token_ids[0].flatten()),
    ('Targets batch 2: \n', y[1]),
    ('Outputs batch 2:\n', token_ids[1].flatten()),
):
    print(f'{label} {token_ids_to_text(ids, tokenizer)}')

Targets batch 1:
  man told me once
Outputs batch 1:
  DriverNP Munich accumulation
Targets batch 2: 
  that all the bad
Outputs batch 2:
 don eagerly dogma ali


As we can see, the model is producing random text that is vastly different from the targets. We need a way to evaluate the model's generated text numerically, using some sort of loss metric. Using this loss, we can then implement a training function to robustly and iteratively update the model's weights and improve the generated text. But in a context like language understanding and text generation, how do you build this loss? We want to measure how far, or how different, the generated tokens are from the correct targets. But how do we embed this information in a function to optimize? How do you even begin to define inaccuracy or incorrectness in text generation, which might have no objectively correct next word?