## 5.1.1 Using GPT to generate text
You begin by taking the gptmodel class from GPT.py and also the GPT_CONFIG_124M


In [118]:
import torch
from GPT import GPTModel

GPT_CONFIG_124M = {
    "vocab_size": 50257,
    "context_length": 256,
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False    
}

torch.manual_seed(123)
model=GPTModel(GPT_CONFIG_124M)
# put the model in evaluation mode - whatever that means
model.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features

In [119]:
import tiktoken
from GPT import generate_text_simple

def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0)
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0)
    return tokenizer.decode(flat.tolist())

# start_context = "Every effort moves you"
tokenizer = tiktoken.get_encoding("gpt2")

# token_ids = generate_text_simple(
#     model=model,
#     idx=text_to_token_ids(start_context, tokenizer),
#     max_new_tokens=10,
#     context_size=GPT_CONFIG_124M["context_length"]
# )

# print("Input text:", start_context )
# print("Output text:", token_ids_to_text(token_ids, tokenizer))

In [120]:
# converting input to tokens
targets = torch.tensor([[3626, 6100, 345],
                        [1107, 588, 11311]])

print(targets.shape)

torch.Size([2, 3])


In [121]:
with torch.no_grad():
    logits = model(targets)
probas = torch.softmax(logits, dim=-1)
print(probas.shape)

torch.Size([2, 3, 50257])


In [122]:
token_ids = torch.argmax(probas, dim=-1, keepdim=True)
print("Token IDs:\n", token_ids)

Token IDs:
 tensor([[[16835],
         [44376],
         [44148]],

        [[16657],
         [ 1641],
         [40930]]])


In [123]:
print(f"Targets batch 1: {token_ids_to_text(targets[0], tokenizer)}")
print(f"Outputs batch 1: {token_ids_to_text(token_ids[0].flatten(), tokenizer)}")

Targets batch 1:  effort moves you
Outputs batch 1:  Train inactionProxy


In [124]:
text_idx = 0
target_probas_1 = probas[text_idx, [0, 1, 2], targets[text_idx]]
print("Text 1: ", target_probas_1)

text_idx = 1
target_probas_2 = probas[text_idx, [0, 1, 2], targets[text_idx]]
print("Text 2:", target_probas_2)

Text 1:  tensor([3.5706e-05, 1.5782e-05, 8.1219e-06])
Text 2: tensor([1.2870e-05, 3.2012e-05, 6.1263e-06])


In [125]:
log_probas = torch.log(torch.cat((target_probas_1, target_probas_2)))

print(log_probas)

tensor([-10.2402, -11.0567, -11.7209, -11.2606, -10.3494, -12.0029])


In [126]:
avg_log_probas = torch.mean(log_probas)
print("Average log probas: ", avg_log_probas)

Average log probas:  tensor(-11.1051)


In [127]:
neg_avg_log_probas = avg_log_probas * -1 
print(neg_avg_log_probas)

tensor(11.1051)


In [128]:
print("logits shape", logits.shape)
print("targets shape", targets.shape)

logits shape torch.Size([2, 3, 50257])
targets shape torch.Size([2, 3])


In [129]:
logits_flat = logits.flatten(0, 1)
targets_flat = targets.flatten()
print("Flattened logits:", logits_flat.shape)
print("Flattened targets:", targets_flat.shape)

Flattened logits: torch.Size([6, 50257])
Flattened targets: torch.Size([6])


In [130]:
loss = torch.nn.functional.cross_entropy(logits_flat, targets_flat)
print(loss)
perplexity = torch.exp(loss)
print(perplexity)

tensor(11.1051)
tensor(66510.8672)
