In [None]:
# -------TRAINING CODE------- #
# Runs 50 AdamW steps on batches from DataLoaderLite and records, per step,
# the loss (`losses`) and the throughput in tokens/second (`all_tokens_per_sec`).
import time

# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

train_loader = DataLoaderLite(B=4, T=1024)
model = GPT(GPTConfig())
model.to(device)

losses = []
all_tokens_per_sec = []
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
for i in range(50):
  # time the full step: batch fetch, host->device copy, forward, backward, update
  t0 = time.time()
  x, y = train_loader.next_batch()
  x, y = x.to(device), y.to(device)
  optimizer.zero_grad() # reset gradients so they do not accumulate across steps
  logits, loss = model(x, y)
  loss.backward() # backprop
  optimizer.step() # update the params based on the backprop
  if device == "cuda":
    # CUDA kernels are launched asynchronously; wait for them to finish so the
    # timer measures the real step time. Skipped on CPU, where there is
    # nothing to wait for and no CUDA runtime to synchronize with.
    torch.cuda.synchronize()
  t1 = time.time()
  t = (t1 - t0) * 1000 # step time in milliseconds
  loss_val = loss.item() # single device->host transfer, reused below
  losses.append(loss_val)
  # throughput in tokens per SECOND: divide by elapsed seconds, not by `t` (ms)
  tokens_per_sec = train_loader.B * train_loader.T / (t1 - t0)
  all_tokens_per_sec.append(tokens_per_sec)
  print(f"step {i+1}: loss = {loss_val} | dt = {t:.2f}ms | throughput = {tokens_per_sec:.2f} tokens/sec")

In [None]:
# -------TRAINING OUTPUT------- 
# cuda
# loaded 338025 tokens
# 1 Epoch = 2640 batches
# step 1: loss = 10.967875480651855
# step 2: loss = 9.800003051757812
# step 3: loss = 9.50924015045166
# step 4: loss = 7.666291236877441
# step 5: loss = 6.684833526611328
# step 6: loss = 5.924807548522949
# step 7: loss = 5.279321670532227
# step 8: loss = 4.746295928955078
# step 9: loss = 4.422515869140625
# step 10: loss = 4.2987494468688965
# step 11: loss = 4.2793073654174805
# step 12: loss = 4.296316623687744
# step 13: loss = 4.316412925720215
# step 14: loss = 4.339062690734863
# step 15: loss = 4.346867084503174
# step 16: loss = 4.324669361114502
# step 17: loss = 4.277787685394287
# step 18: loss = 4.224325180053711
# step 19: loss = 4.16982364654541
# step 20: loss = 4.141805171966553
# step 21: loss = 4.15151309967041
# step 22: loss = 4.181767463684082
# step 23: loss = 4.207672595977783
# step 24: loss = 4.204050064086914
# step 25: loss = 4.189088821411133
# step 26: loss = 4.175002574920654
# step 27: loss = 4.1621198654174805
# step 28: loss = 4.140615940093994
# step 29: loss = 4.144059658050537
# step 30: loss = 4.142584323883057
# step 31: loss = 4.15804386138916
# step 32: loss = 4.151178359985352
# step 33: loss = 4.148890495300293
# step 34: loss = 4.137920379638672
# step 35: loss = 4.131983757019043
# step 36: loss = 4.121450901031494
# step 37: loss = 4.101718425750732
# step 38: loss = 7.623427391052246
# step 39: loss = 4.1219801902771
# step 40: loss = 4.131641864776611
# step 41: loss = 4.149787425994873
# step 42: loss = 4.147574424743652
# step 43: loss = 4.141959190368652
# step 44: loss = 4.137613296508789
# step 45: loss = 4.128176689147949
# step 46: loss = 4.112405776977539
# step 47: loss = 4.105103969573975
# step 48: loss = 4.1128339767456055
# step 49: loss = 4.114365100860596
# step 50: loss = 4.110799789428711


In [None]:
# -------INFERENCE CODE-------
# Generating outputs from our model:
# we load the pretrained "gpt2" weights via GPT.from_pretrained into our own
# GPT class and sample continuations of a fixed prompt with top-k (k=50) sampling.
import tiktoken

num_return_seq = 5
max_length = 30

# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

model = GPT.from_pretrained("gpt2")
# model = GPT2LMHeadModel.from_pretrained("gpt2")
model.eval() # switch modules to evaluation mode (only matters for layers such as dropout, if the model has any)
model.to(device) # move all the parameters to the selected device

enc = tiktoken.get_encoding('gpt2') # tokenizer for gpt2
tokens = enc.encode("Hello, I'm a language model,") # 8 tokens
x = torch.tensor(tokens, dtype=torch.long).unsqueeze(0).repeat(num_return_seq, 1).to(device) # [5, 8]

# tokenized and ready to generate
torch.manual_seed(420)
torch.cuda.manual_seed(420) # silently ignored when CUDA is not available
with torch.no_grad(): # sampling only, no gradients needed
  while x.size(1) < max_length:
    out = model(x) # goes through the entire network and gives us output logits
    # The training cell unpacks `logits, loss = model(x, y)`, so accept either a
    # bare logits tensor or a tuple whose first item is the logits.
    logits = out[0] if isinstance(out, tuple) else out
    # logits = logits.logits  # needed only with the Hugging Face GPT2LMHeadModel above
    logits = logits[:, -1, :] # keep only the logits at the last position
    probs = F.softmax(logits, dim=-1)
    # top-k sampling: keep only the 50 most likely tokens so the model cannot
    # sample very unlikely ones -- keeps the generation on track
    topk_probs, topk_indices = torch.topk(probs, k=50, dim=-1)
    idx_next = torch.multinomial(topk_probs, 1) # index into the top-k list, one sample per row
    xcol = torch.gather(topk_indices, -1, idx_next) # map back to vocabulary token ids
    x = torch.cat((x, xcol), dim=-1) # append the sampled token to every sequence

for i in range(num_return_seq):
  tokens = x[i, :max_length].tolist()
  decoded = enc.decode(tokens)
  print(">>", decoded)


## -------OUTPUT-------
# >> Hello, I'm a language model, not a computer. You could call me a language model, with the same language as I'm writing. I
# >> Hello, I'm a language model, not a programmer. I'm just doing what you call, writing things instead of just code. But it's
# >> Hello, I'm a language model, that sorta, I'd like to know how it was constructed. So, I've built an
# >> Hello, I'm a language model, a grammar. I'm very careful not to make mistakes or use an unfair definition of "language" to justify
# >> Hello, I'm a language model, I'm an action model. Well, I think this is a good idea. In February, the