# Run the exact GPT2

Run GPT-2 with the pretrained weights provided by Hugging Face.

In [1]:
from model.skeleton_gpt2 import GPT
import torch

In [2]:
# Generation settings used by the later cells:
# number of sequences sampled in parallel (used as the batch size of the prompt tensor)
num_return_sequences = 1
# total sequence length, prompt tokens included, at which generation stops
max_length = 100

# Build the project's GPT model from the pretrained "gpt2" checkpoint
# (the cell output reports "loading weights from pretrained gpt: gpt2").
model = GPT.from_pretrained("gpt2")
# Switch to evaluation mode; the model is only used for inference in this notebook.
# NOTE(review): assumes GPT is a torch.nn.Module (the printed repr suggests so) - confirm.
model.eval()
# Keep the model on the CPU. As the last expression of the cell, its return value
# is what the notebook displays as the cell result.
model.to("cpu")

  from .autonotebook import tqdm as notebook_tqdm


loading weights from pretrained gpt: gpt2
------

This is the converted hugging face model:

------ GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): CasualSelfAttention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear(in_features=768, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): GELU(approximate='tanh')
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): CasualSelfAttention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=True)
          (c_proj): Linear(in_features=768, out_features=768, bias=True)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (gelu): GELU(approximate='tanh')
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [3]:
import tiktoken

# GPT-2 byte-pair-encoding tokenizer; the same object is used later to decode
# the generated token IDs back into text.
enc = tiktoken.get_encoding("gpt2")
# Turn the prompt into a list of integer token IDs.
tokens = enc.encode("My name is Johann Sebastian Bach and I am a")
tokens = torch.tensor(tokens, dtype=torch.long)
# Add a leading batch dimension and copy the prompt once per requested sequence,
# giving a tensor of shape (num_return_sequences, prompt_length).
tokens = tokens.unsqueeze(0).repeat(num_return_sequences, 1)
# Place the input explicitly on the CPU, the device the model was moved to.
# A bare `.to()` without a device or dtype does nothing and hides the intent.
x = tokens.to("cpu")

In [None]:
from torch.nn import functional as F

# Fix the RNG seed so the sampled continuation is reproducible.
torch.manual_seed(42)
# Inference only: no gradients are needed anywhere in the generation loop.
with torch.no_grad():
    # Append one sampled token per pass until x holds max_length tokens.
    while x.size(1) < max_length:
        # Feed the whole current sequence of token IDs to the model. It returns
        # the logits (unnormalized next-token scores) for every position and a
        # second value that is not used here.
        logits, loss = model(x)
        # logits has shape (batch_size, sequence_length, vocab_size); the
        # sequence dimension starts at the prompt length and grows by one on
        # every pass because one token is appended each time.
        print(logits.shape)
        # Only the last position matters for predicting the next token.
        last_logits = logits[:, -1, :]
        # Normalize the scores over the vocabulary into probabilities.
        next_probs = F.softmax(last_logits, dim=-1)
        # Top-k sampling: instead of always taking the single most probable
        # token (which tends to produce repetitive text), keep only the 50
        # most likely candidates. torch.topk returns them sorted from most to
        # least probable, together with their vocabulary indices.
        top_probs, top_ids = torch.topk(next_probs, k=50, dim=-1)
        # Draw one candidate per row in proportion to its probability. The
        # result indexes into the 50 candidates, not into the vocabulary.
        choice = torch.multinomial(top_probs, num_samples=1)
        # Map the sampled candidate position back to the real token ID.
        # next_token has shape (batch_size, 1).
        next_token = torch.gather(top_ids, dim=-1, index=choice)
        # Extend the running sequence by the newly sampled token.
        x = torch.cat((x, next_token), dim=1)

torch.Size([1, 10, 50257])
torch.Size([1, 11, 50257])
torch.Size([1, 12, 50257])
torch.Size([1, 13, 50257])
torch.Size([1, 14, 50257])
torch.Size([1, 15, 50257])
torch.Size([1, 16, 50257])
torch.Size([1, 17, 50257])
torch.Size([1, 18, 50257])
torch.Size([1, 19, 50257])
torch.Size([1, 20, 50257])
torch.Size([1, 21, 50257])
torch.Size([1, 22, 50257])
torch.Size([1, 23, 50257])
torch.Size([1, 24, 50257])
torch.Size([1, 25, 50257])
torch.Size([1, 26, 50257])
torch.Size([1, 27, 50257])
torch.Size([1, 28, 50257])
torch.Size([1, 29, 50257])
torch.Size([1, 30, 50257])
torch.Size([1, 31, 50257])
torch.Size([1, 32, 50257])
torch.Size([1, 33, 50257])
torch.Size([1, 34, 50257])
torch.Size([1, 35, 50257])
torch.Size([1, 36, 50257])
torch.Size([1, 37, 50257])
torch.Size([1, 38, 50257])
torch.Size([1, 39, 50257])
torch.Size([1, 40, 50257])
torch.Size([1, 41, 50257])
torch.Size([1, 42, 50257])
torch.Size([1, 43, 50257])
torch.Size([1, 44, 50257])
torch.Size([1, 45, 50257])
torch.Size([1, 46, 50257])
t

In [7]:
# Print every generated sequence as text, one per line, prefixed with ">".
for seq_idx in range(num_return_sequences):
    # Take at most max_length token IDs of this sequence as a plain Python list.
    token_ids = x[seq_idx, :max_length].tolist()
    # Convert the token IDs back to a string with the GPT-2 tokenizer.
    print(">", enc.decode(token_ids))

> My name is Johann Sebastian Bach and I am a composer, writer, educator, performer. I specialize in composing the most important works of music and have been practicing my piano since 2004.

I am currently in the process of mastering my first piece "Beethoven's Symphony in Three Decades" since it became a part of the "Moral Piano". For this I have been teaching myself new techniques and ideas.

In my research I've found that "The Piano Concerto"
