In [1]:
from model.skeleton_gpt2 import GPT
import torch

# Run the exact GPT2

In [20]:
num_return_sequences = 1
max_length = 50

model = GPT.from_pretrained("gpt2-medium")
model.eval()
model.to("cpu")

loading weights from pretrained gpt: gpt2-medium
------

This is the converted hugging face model:

------ GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (h): ModuleList(
      (0-23): 24 x Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): CasualSelfAttention(
          (c_attn): Linear(in_features=1024, out_features=3072, bias=True)
          (c_proj): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
          (gelu): GELU(approximate='tanh')
          (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50257, bias=False)
)


GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50257, 1024)
    (wpe): Embedding(1024, 1024)
    (h): ModuleList(
      (0-23): 24 x Block(
        (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (attn): CasualSelfAttention(
          (c_attn): Linear(in_features=1024, out_features=3072, bias=True)
          (c_proj): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
          (gelu): GELU(approximate='tanh')
          (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1024, out_features=50257, bias=False)
)

In [21]:
import tiktoken
enc = tiktoken.get_encoding("gpt2")
tokens = enc.encode("What is the capital of France?")
tokens = torch.tensor(tokens, dtype=torch.long)
tokens = tokens.unsqueeze(0).repeat(num_return_sequences, 1)
x = tokens.to()

In [22]:
from torch.nn import functional as F

torch.manual_seed(42)
while x.size(1) < max_length:
    with torch.no_grad(): # we are not training the model (no gradient calculation), only using it to generate text
        # We pass the current sequence x (tensor of token IDs) to the model
        # The model outputs the logits, which represent the unnormalized probabilities of the next token
        logits, loss = model(x)
        # We start out with a shape of 8 as our input tensor x contains 8 token IDs
        # After each iteration, we add a new token to the sequence, so the shape of x grows by 1
        print(logits.shape)
        # We extract the logits for the last token in the sequence (-1)
        # This is because we only care about predicting the next token based on the current context
        logits = logits[:, -1, :]
        # We apply a softmax function to the logits of the last token
        probs = F.softmax(logits, dim=-1)
        # torch.topk selects the top 50 probabilities (topk_probs) and their corresponding indices (topk_indices)
        # along the last dimension (vocabulary dimension).
        # this is an ordered tensor with the highest probabilities at the beginning
        topk_probs, topk_indices = torch.topk(probs, 50, dim=-1)
        # E[11]
        ix = torch.multinomial(topk_probs, 1)
        # We use torch.gather to extract the actual token ID from the top 50 indices (topk_indices) based on the sampled index (ix).
        # xcol becomes a one-dimensional tensor containing the sampled token ID.
        xcol = torch.gather(topk_indices, -1, ix)
        # We concatenate the sampled token ID to the current sequence x.
        x = torch.cat((x, xcol), 1)

torch.Size([5, 7, 50257])
torch.Size([5, 8, 50257])
torch.Size([5, 9, 50257])
torch.Size([5, 10, 50257])
torch.Size([5, 11, 50257])
torch.Size([5, 12, 50257])
torch.Size([5, 13, 50257])
torch.Size([5, 14, 50257])
torch.Size([5, 15, 50257])
torch.Size([5, 16, 50257])
torch.Size([5, 17, 50257])
torch.Size([5, 18, 50257])
torch.Size([5, 19, 50257])
torch.Size([5, 20, 50257])
torch.Size([5, 21, 50257])
torch.Size([5, 22, 50257])
torch.Size([5, 23, 50257])
torch.Size([5, 24, 50257])
torch.Size([5, 25, 50257])
torch.Size([5, 26, 50257])
torch.Size([5, 27, 50257])
torch.Size([5, 28, 50257])
torch.Size([5, 29, 50257])
torch.Size([5, 30, 50257])
torch.Size([5, 31, 50257])
torch.Size([5, 32, 50257])
torch.Size([5, 33, 50257])
torch.Size([5, 34, 50257])
torch.Size([5, 35, 50257])
torch.Size([5, 36, 50257])
torch.Size([5, 37, 50257])
torch.Size([5, 38, 50257])
torch.Size([5, 39, 50257])
torch.Size([5, 40, 50257])
torch.Size([5, 41, 50257])
torch.Size([5, 42, 50257])
torch.Size([5, 43, 50257])
torc

In [23]:
for i in range(num_return_sequences):
    tokens = x[i, :max_length].tolist()
    # Here we decode the token IDs back to text.
    decoded = enc.decode(tokens)
    print(">", decoded)

> What is the capital of France?

In France, the capital is Lyon. All cities have a capital, as does France's other "main city", Nice. If you happen to live in Paris, you can also call the capital a part
> What is the capital of France? This question would take several answers, ranging from what most people consider the capital of France to perhaps New York City (the only U.S. city where it is capital of residents rather than citizens). It would be
> What is the capital of France?

Paris: France.


What is the capital of France? Paris: France in modern times.


How many capitals is the US? London: about the same number as London in 1850.


> What is the capital of France?

The capital of France is Paris. Or as we often say French: Paris is capital of France. What is your capital?

It depends on the country. We have the capital of France (Paris
> What is the capital of France? In ancient times the capital of France was Charlemagne, and he was said to have called it the City of the Sun. 