In [6]:
import torch
import os, sys
import tiktoken

# Resolve the repository root as two directory levels above the current
# working directory (the directory the kernel was started in).
project_root = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, os.pardir)
)

# Directory that is put on sys.path so packages inside it can be imported.
stage1_root = os.path.join(project_root, "stage1")

# Re-running this cell must not keep stacking duplicate entries on sys.path:
# drop any earlier occurrences first, then put the directory at the front so
# it still takes priority over other import locations.
sys.path[:] = [entry for entry in sys.path if entry != stage1_root]
sys.path.insert(0, stage1_root)

# now 'src' is a top-level package
from src.gpt2small import GPTModel, GPTConfig124, generate_text


In [7]:
# Hyper-parameters handed to the model constructor, one per line so each
# value is easy to find and tweak.
cfg_pt = GPTConfig124(
    vocab_size=50257,
    context_length=256,
    emb_dim=768,
    n_heads=12,
    n_layers=12,
    dropout=0.1,
    qkv_bias=False,
)

# Seed torch's RNG before building the model so that any random
# initialisation done by the constructor is repeatable across runs.
torch.manual_seed(123)
model = GPTModel(cfg_pt)

# Switch to evaluation mode (turns off dropout); as the last expression of
# the cell this also displays the module summary.
model.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_q): Linear(in_features=768, out_features=768, bias=False)
        (W_k): Linear(in_features=768, out_features=768, bias=False)
        (W_v): Linear(in_features=768, out_features=768, bias=False)
        (dropout): Dropout(p=0.1, inplace=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_q): Linear(in_features=768, out_feat

In [8]:
def text_to_token_ids(text, tokenizer, allowed_special=None):
    """
    Encode ``text`` with ``tokenizer`` and return the ids as a batch of size 1.

    Args:
        text: The string to tokenize.
        tokenizer: Any object exposing ``encode(text, allowed_special=...)``
            that returns a list of integer token ids (e.g. a tiktoken encoding).
        allowed_special: Special tokens that are permitted to appear in
            ``text``. May be the literal string ``"all"``, a single token
            string, or an iterable of token strings. When omitted or falsy,
            only ``'<|endoftext|>'`` is allowed.

    Returns:
        A ``torch.long`` tensor of shape ``[1, num_tokens]``.

    Background on the reshaping used below:
    tensor.unsqueeze(dim) inserts a new axis (of size 1) at index dim.
    x = torch.tensor([10, 20, 30])       shape: [3]
    x0 = x.unsqueeze(0)                 shape: [1,3]
    x1 = x.unsqueeze(1)                 shape: [3,1]
    """
    if not allowed_special:
        # A set literal is required here: ('<|endoftext|>') is just a string,
        # and set() of a string would split it into single characters.
        allowed = {'<|endoftext|>'}
    elif isinstance(allowed_special, str):
        # "all" is forwarded untouched; any other string is treated as ONE
        # token rather than being exploded into its characters.
        allowed = allowed_special if allowed_special == "all" else {allowed_special}
    else:
        allowed = set(allowed_special)

    token_list = tokenizer.encode(text, allowed_special=allowed)

    # An explicit integer dtype keeps the result usable as indices even when
    # the token list is empty (torch.tensor([]) would otherwise be float).
    ids = torch.tensor(token_list, dtype=torch.long).unsqueeze(0)
    # unsqueeze turns a 1D sequence of token IDs into a 2D batch of size 1.
    # Almost all PyTorch nn.Modules (embeddings, transformers, etc.)
    # expect inputs of shape (batch_size, seq_len, ...), so
    # even if we only have one example, we need to present it as a batch of size 1.
    return ids

def token_ids_to_text(token_ids, tokenizer):
    """
    Decode a tensor of token ids back into a string.

    Args:
        token_ids: Tensor of token ids; a leading axis of size 1 (the batch
            axis) is removed before decoding.
        tokenizer: Any object exposing ``decode(list_of_ids)``.

    Returns:
        Whatever ``tokenizer.decode`` returns for the flattened id list.

    Background on the reshaping used below:
    tensor.squeeze(dim) drops the axis at ``dim`` only when that axis has
    size 1; called with no argument it drops every size-1 axis.
    y = torch.zeros(1, 5, 1)         shape: [1,5,1]
    y.squeeze(0)                     shape: [5,1]
    y.squeeze(2)                     shape: [1,5]
    y.squeeze()                      shape: [5]
    """
    # Strip the batch axis (if it has size 1) and convert to plain Python
    # ints, which is what the tokenizer's decode method is given.
    id_list = token_ids.squeeze(0).tolist()
    decoded = tokenizer.decode(id_list)
    return decoded

# Prompt that the model is asked to continue.
st_context = "A man told me"
tokenizer = tiktoken.get_encoding('gpt2')

# Pass the special-token allow-list explicitly so this call does not depend
# on any default value of text_to_token_ids' third parameter.
token_ids = generate_text(
    model = model,
    idx = text_to_token_ids(
        st_context, tokenizer, allowed_special={'<|endoftext|>'}
    ),
    max_new_tokens = 10,
    # Limit the context window to the length the model was configured with.
    context_size = cfg_pt.context_length
)

print('Output text: ', token_ids_to_text(token_ids, tokenizer))

Output text:  A man told me accumulation thumbnail Flask 406 propensity Hat lush Tulsolk se


### Text Generation Loss