In [None]:
# -------TRAINING CODE------- #

import time

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

# optim #1 -- float32 matmul precision.
# 'high' lets float32 matmuls (the Linear layers in particular) run in tf32;
# the author expected roughly an 8x speedup from this.
# 'medium' (bf16 matmuls) was also tried, but bf16 is handled through
# torch.autocast in the training loop instead, so 'medium' is not used here.
torch.set_float32_matmul_precision('high')

train_loader = DataLoaderLite(B=8, T=1024)

model = GPT(GPTConfig())
# optim #3 -- torch.compile, applied after the model is moved to the device.
# NOTE(review): relies on GPT being an nn.Module, whose .to() returns the module itself.
model = torch.compile(model.to(device))

# Per-step history, kept so it can be inspected after training.
losses = []
all_tokens_per_sec = []
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
for i in range(50):
  # Time the full step: batch fetch, forward, backward and parameter update.
  t0 = time.time()
  x, y = train_loader.next_batch()
  x, y = x.to(device), y.to(device)
  optimizer.zero_grad() # clear the gradients of the previous step so they do not accumulate
  # optim #2 -- torch.autocast: weights stay in float32 while selected ops run in bfloat16.
  # bfloat16 (not float16) is used on purpose: float16 has a much narrower exponent range
  # and would need a GradScaler to avoid gradient underflow; bfloat16 does not.
  with torch.autocast(device_type=device, dtype=torch.bfloat16):
    logits, loss = model(x, y)
  loss.backward() # backprop
  optimizer.step() # update the params based on the backprop
  if device == "cuda":
    # CUDA kernels are launched asynchronously; block the CPU until the GPU has
    # finished so the timing below covers the real work. Skipped on CPU, where
    # there is nothing to wait for.
    torch.cuda.synchronize()
  t1 = time.time()
  dt = t1 - t0 # elapsed seconds
  t = dt * 1000 # milliseconds (reported in the log line)
  loss_val = loss.item() # single device->host transfer, reused below
  losses.append(loss_val)
  # Throughput is a more objective metric than step time: tokens processed per second.
  # Divide by seconds (dt), not by the millisecond value t.
  tokens_per_sec = (train_loader.B * train_loader.T) / dt
  all_tokens_per_sec.append(tokens_per_sec)
  print(f"step {i+1}: loss = {loss_val} | time = {t} | throughput = {tokens_per_sec}")

In [None]:
# -------INFERENCE CODE------- #
# Generating outputs from our model.
# We use the pretrained weights we got from hf_model, but run them through our own GPT-2 model in eval mode.

num_return_seq = 5 # how many independent continuations to sample
max_length = 30    # total sequence length (prompt + generated tokens) at which sampling stops

# Resolve the device here so this cell also runs on a CPU-only machine
# instead of failing on a hard-coded 'cuda'.
device = "cuda" if torch.cuda.is_available() else "cpu"

model = GPT.from_pretrained("gpt2")
# model = GPT2LMHeadModel.from_pretrained("gpt2")  # alternative: the Hugging Face reference model
# Switch to inference mode. This only changes behavior for layers that act differently
# in training vs. eval (e.g. dropout); NOTE(review): confirm whether GPT has any.
model.eval()
model.to(device) # move all the model tensors to the selected device

import tiktoken
enc = tiktoken.get_encoding('gpt2') # tokenizer for gpt2
tokens = enc.encode("Hello, I'm a language model,") # 8 tokens
# Add a batch dimension and repeat the prompt once per requested sequence: [5, 8]
x = torch.tensor(tokens, dtype=torch.long).unsqueeze(0).repeat(num_return_seq, 1).to(device)

# tokenized and ready to generate
# Fix the RNG seeds so the sampled continuations are reproducible.
torch.manual_seed(420)
torch.cuda.manual_seed(420)
# No gradients are needed for generation; disable autograd for the whole loop.
with torch.no_grad():
  while x.size(1) < max_length:
    out = model(x) # goes through the entire network and gives us output logits
    # The training cell unpacks model(x, y) as (logits, loss). Accept either a bare
    # logits tensor or such a tuple here, so this works whichever form forward
    # returns when no targets are passed.
    logits = out[0] if isinstance(out, tuple) else out
    # logits = logits.logits  # needed instead when using the Hugging Face model
    logits = logits[:, -1, :] # keep only the prediction at the last position
    probs = F.softmax(logits, dim=-1)
    # Top-k sampling: keep the 50 most likely tokens and ignore everything else,
    # which keeps the model on track.
    topk_probs, topk_indices = torch.topk(probs, k=50, dim=-1)
    # Sample a position within the top-k (multinomial works on the unnormalized weights),
    idx_next = torch.multinomial(topk_probs, 1)
    # map it back to the actual vocabulary id,
    xcol = torch.gather(topk_indices, -1, idx_next)
    # and append the new token to every sequence.
    x = torch.cat((x, xcol), dim=-1)

# Decode each generated row back to text and print it.
for seq_idx in range(num_return_seq):
  sample_ids = x[seq_idx, :max_length].tolist()
  print(">>", enc.decode(sample_ids))


## -------OUTPUT-------
# >> Hello, I'm a language model, not a computer. You could call me a language model, with the same language as I'm writing. I
# >> Hello, I'm a language model, not a programmer. I'm just doing what you call, writing things instead of just code. But it's
# >> Hello, I'm a language model, that sorta, I'd like to know how it was constructed. So, I've built an
# >> Hello, I'm a language model, a grammar. I'm very careful not to make mistakes or use an unfair definition of "language" to justify
# >> Hello, I'm a language model, I'm an action model. Well, I think this is a good idea. In February, the