In [None]:
from transformers import pipeline, set_seed
# Build a text-generation pipeline for the 'gpt2' model, fix the seed, and
# request five continuations of the prompt (max_length=30). The final call is
# left as a bare expression so the notebook displays its result.
generator = pipeline(task='text-generation', model='gpt2')
set_seed(42)
generator(
    "A language model does",
    max_length=30,
    num_return_sequences=5,
)

In [None]:
# parse and visualize the logfile
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Model size whose reference numbers are drawn as baselines in the plots.
sz = "124M"

# Each table is queried with .get() so that a size with no recorded value
# yields None instead of raising KeyError. The loss table only has an entry
# for "124M", and the plotting code already checks every baseline before
# drawing it, so a missing value simply means "no reference line".
loss_baseline = { # validation loss of the OpenAI GPT-2 checkpoint
    "124M": 3.2924,
}.get(sz)
hella2_baseline = { # HellaSwag for GPT-2
    "124M": 0.294463,
    "350M": 0.375224,
    "774M": 0.431986,
    "1558M": 0.488946,
}.get(sz)
hella3_baseline = { # HellaSwag for GPT-3
    "124M": 0.337,
    "350M": 0.436,
    "774M": 0.510,
    "1558M": 0.547,
}.get(sz)

# load the log file
with open("log/log.txt", "r") as f:
    lines = f.readlines()

# parse the individual lines, group by stream (train,val,hella)
# every record is three whitespace-separated fields: step, stream name, value.
# Blank lines (e.g. a trailing newline at the end of the file) hold no record
# and are skipped; a line with any other number of fields still raises
# ValueError so that a corrupted log is not silently ignored.
streams = {}
for line in lines:
    fields = line.split()
    if not fields:
        continue
    step, stream, val = fields
    streams.setdefault(stream, {})[int(step)] = float(val)

# convert each stream from {step: val} to (steps[], vals[])
# so it's easier for plotting
streams_xy = {}
for k, v in streams.items():
    # get all (step, val) items, sorted by step
    xy = sorted(v.items())
    # unpack the list of tuples to a pair of tuples: (steps, vals)
    streams_xy[k] = list(zip(*xy))

# One wide figure holding two side-by-side panels.
plt.figure(figsize=(16, 6))

# ---- left panel: train and val loss on a log-scaled y axis ----
plt.subplot(1, 2, 1)
xs, ys = streams_xy["train"] # training loss
ys = np.array(ys)
plt.plot(xs, ys, label=f'nanogpt ({sz}) train loss')
print("Min Train Loss:", min(ys))
xs, ys = streams_xy["val"] # validation loss
plt.plot(xs, ys, label=f'nanogpt ({sz}) val loss')
if loss_baseline is not None:
    # dashed red reference line at the GPT-2 checkpoint's validation loss
    plt.axhline(
        y=loss_baseline,
        linestyle='--',
        color='r',
        label=f"OpenAI GPT-2 ({sz}) checkpoint val loss",
    )
plt.title("Loss")
plt.xlabel("steps")
plt.ylabel("loss")
plt.yscale('log')
plt.ylim(top=4.0) # set after the scale change; clips the top of the loss axis
plt.legend()
print("Min Validation Loss:", min(ys))

# ---- right panel: HellaSwag accuracy over training ----
plt.subplot(1, 2, 2)
xs, ys = streams_xy["hella"] # HellaSwag eval
ys = np.array(ys)
plt.plot(xs, ys, label=f"nanogpt ({sz})")
# dashed reference lines, drawn only for baselines that are set (truthy)
if hella2_baseline:
    plt.axhline(
        y=hella2_baseline,
        linestyle='--',
        color='r',
        label=f"OpenAI GPT-2 ({sz}) checkpoint",
    )
if hella3_baseline:
    plt.axhline(
        y=hella3_baseline,
        linestyle='--',
        color='g',
        label=f"OpenAI GPT-3 ({sz}) checkpoint",
    )
plt.title("HellaSwag eval")
plt.xlabel("steps")
plt.ylabel("accuracy")
plt.legend()
print("Max Hellaswag eval:", max(ys))

In [9]:
from model import GPTConfig, GPT
import torch
import torch.nn.functional as F
import tiktoken

# Use the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Build the model with vocab_size=50304 and read the checkpoint onto the CPU.
# NOTE(review): weights_only=False lets torch.load unpickle arbitrary Python
# objects, so this must only ever be pointed at a trusted checkpoint file.
model = GPT(GPTConfig(vocab_size=50304))
ckpt = torch.load("log/model_19072.pt", map_location="cpu", weights_only=False)

# The file may hold the weights under a "model" key or be the state dict itself.
state = ckpt
if isinstance(ckpt, dict) and "model" in ckpt:
    state = ckpt["model"]

model.load_state_dict(state)

<All keys matched successfully>

In [None]:
# greedy sampling: repeatedly append the single highest-scoring next token
# until the sequence is max_length tokens long, then decode and print it.

model.to(device)
model.eval()
enc = tiktoken.get_encoding("gpt2")

num_return_sequences = 1
max_length = 25
tokens = enc.encode("A language model does")
tokens = torch.tensor(tokens, dtype=torch.long)
tokens = tokens.unsqueeze(0).repeat(num_return_sequences, 1) # convert to a sample batch
# The input has to live on the same device as the model: the model was just
# moved to `device`, so pinning the tokens to "cpu" breaks the forward pass
# with a device mismatch whenever CUDA is available.
xgen = tokens.to(device)
with torch.no_grad():
    while xgen.size(1) < max_length:
        # the first element of the model's return value is the logits
        logits = model(xgen)[0] # (B, T, vocab_size)
        logits = logits[:, -1, :] # (B, vocab_size)
        # softmax is monotonic, so the argmax of the logits is already the
        # most probable token; no need to normalise first
        xcol = torch.argmax(logits, dim=-1, keepdim=True) # (B, 1)
        xgen = torch.cat((xgen, xcol), dim=1)
for i in range(num_return_sequences):
    tokens = xgen[i, :max_length].tolist()
    decoded = enc.decode(tokens)
    print(f"{i}: {decoded}")

0: A language model does not have a clear definition of what it means to be a language learner.
The term “


In [7]:

# top k sampling: draw each next token from the 50 highest-probability
# candidates until the sequence is max_length tokens long.

model.load_state_dict(state)
model.to(device)
model.eval()
enc = tiktoken.get_encoding("gpt2")

# seed the RNG so the sampled continuation is the same on every re-run
torch.manual_seed(42)

num_return_sequences = 1
max_length = 50
tokens = enc.encode("A language model does")
tokens = torch.tensor(tokens, dtype=torch.long)
tokens = tokens.unsqueeze(0).repeat(num_return_sequences, 1) # convert to a sample batch
# keep the input on the same device as the model (it was just moved to
# `device`); pinning it to "cpu" fails whenever CUDA is available
xgen = tokens.to(device)
with torch.no_grad():
    while xgen.size(1) < max_length:
        # Take the logits by index instead of unpacking a fixed number of
        # names: the greedy and profiling cells unpack three return values
        # from model(...), so a two-name unpack here raises ValueError.
        logits = model(xgen)[0] # (B, T, vocab_size)
        logits = logits[:, -1, :] # (B, vocab_size)
        probs = F.softmax(logits, dim=-1)
        topk_probs, topk_idxs = torch.topk(probs, 50, dim=-1) # (B, 50) each
        # multinomial treats each row as unnormalised weights, so the top-k
        # probabilities do not need to be renormalised before sampling
        ix = torch.multinomial(topk_probs, 1) # (B, 1) position within the top-k
        xcol = torch.gather(topk_idxs, dim=-1, index=ix) # (B, 1) vocabulary ids
        xgen = torch.cat((xgen, xcol), dim=1)

for i in range(num_return_sequences):
    tokens = xgen[i, :max_length].tolist()
    decoded = enc.decode(tokens)
    print(f"{i}: {decoded}")

0: A language model does not allow for a system to learn. This principle is illustrated in the following illustration from the previous page:
The following example demonstrates the use of the language model.
You can view and modify the language model, or you can


In [8]:
# nucleus (top-p) sampling: draw each next token from the shortest prefix of
# the probability-sorted vocabulary whose cumulative mass exceeds top_p.
top_p = 0.9

# seed the RNG so the sampled continuation is the same on every re-run
torch.manual_seed(42)

num_return_sequences = 1
max_length = 50
tokens = enc.encode("A language model does")
tokens = torch.tensor(tokens, dtype=torch.long)
tokens = tokens.unsqueeze(0).repeat(num_return_sequences, 1) # convert to a sample batch
# keep the input on the same device the earlier cells moved the model to;
# pinning it to "cpu" fails whenever CUDA is available
xgen = tokens.to(device)
with torch.no_grad():
    while xgen.size(1) < max_length:
        # Take the logits by index instead of unpacking a fixed number of
        # names: the greedy and profiling cells unpack three return values
        # from model(...), so a two-name unpack here raises ValueError.
        logits = model(xgen)[0] # (B, T, vocab_size)
        logits = logits[:, -1, :] # (B, vocab_size)
        probs = F.softmax(logits, dim=-1)

        sorted_probs, sorted_idxs = torch.sort(probs, descending=True, dim=-1)
        cumprobs = torch.cumsum(sorted_probs, dim=-1)

        # Mark tokens to drop. Shifting the mask right by one keeps the token
        # that first pushes the cumulative mass past top_p, and forcing column
        # 0 to False guarantees at least one candidate always survives.
        mask = cumprobs > top_p
        mask[:, 1:] = mask[:, :-1].clone()
        mask[:, 0] = False

        # zero the dropped tokens and renormalise the survivors
        sorted_probs[mask] = 0.0
        sorted_probs = sorted_probs / sorted_probs.sum(-1, keepdim=True)

        ix = torch.multinomial(sorted_probs, 1) # (B, 1) position in sorted order
        xcol = torch.gather(sorted_idxs, -1, ix) # (B, 1) vocabulary ids
        xgen = torch.cat((xgen, xcol), dim=1)
for i in range(num_return_sequences):
    tokens = xgen[i, :max_length].tolist()
    decoded = enc.decode(tokens)
    print(f"{i}: {decoded}")

0: A language model does not predict on how many families depend upon English. For example, the first step of learning to read is trying to learn how to use slang. A reading therapist is a person who has spent most of their careers learning and making oral


In [None]:

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.cuda.nccl as nccl
import torch.distributed as dist


# True only when every one of these holds:
#   - torch was built with CUDA and a CUDA device is usable,
#   - torch reports bf16 support for the GPU,
#   - the CUDA version torch was built against is >= 11.0,
#   - NCCL is available and its version is >= 2.10.
# The CUDA version string is parsed with plain str/int operations: this cell
# does not import `packaging`, so the earlier packaging.version.parse(...) call
# raised NameError as soon as torch.version.cuda was set.
# The is_available() guard keeps the later CUDA queries from running on a
# machine without a usable GPU, and bool() turns the short-circuit result
# (which could be None) into a plain True/False flag.
verify_bfloat_support = bool(
    torch.version.cuda
    and torch.cuda.is_available()
    and torch.cuda.is_bf16_supported()
    and tuple(int(part) for part in torch.version.cuda.split(".")[:2]) >= (11, 0)
    and dist.is_nccl_available()
    and nccl.version() >= (2, 10)
)

# torch.cuda.is_bf16_supported() only confirms that the GPU natively supports
# BF16; it does not confirm that the network (communication stack) can handle it.
# "The GPU can compute BF16, but NCCL might not be able to reduce BF16."

# In other words, when using FSDP, calling torch.cuda.is_bf16_supported() only
# tells us whether the GPU supports bf16; it does not guarantee that the
# distributed communication stack can safely use it.
# Compute support != communication support.
# That call on its own says nothing about multi-GPU communication, which
# depends on the communication backend, i.e. NCCL's collective algorithms and
# its reduction and accumulation kernels:
# correct BF16 arithmetic kernels, accumulation semantics, and packing/unpacking
# of bf16 data along the GPU-to-GPU communication path, all IMPLEMENTED BY NCCL.

In [11]:
import torch
from torch.profiler import profile, ProfilerActivity

def run_kv(model, idx, new_tokens):
    """Greedy-decode ``new_tokens`` steps while reusing the model's cache.

    ``idx`` is run through the model once with ``past_kv=None``. Every later
    step feeds only the newly chosen token, together with the ``past_kv``
    value returned by the previous call. Gradients are disabled throughout
    and nothing is returned; the function exists to be timed.
    """
    with torch.no_grad():
        # initial pass over the whole prompt
        step_logits, _, cache = model(idx, past_kv=None)
        for _step in range(new_tokens):
            # highest-scoring token at the last position, kept as a (B, 1) column
            last_position = step_logits[:, -1, :]
            chosen = last_position.argmax(dim=-1, keepdim=True)
            step_logits, _, cache = model(chosen, past_kv=cache)

def run_no_kv(model, idx, new_tokens):
    """Greedy-decode ``new_tokens`` steps without reusing any cache.

    Each step runs the model on the full sequence so far with
    ``past_kv=None``, picks the highest-scoring token at the last position and
    appends it. The caller's ``idx`` tensor is not modified. Gradients are
    disabled throughout and nothing is returned; the function exists to be timed.
    """
    with torch.no_grad():
        sequence = idx
        for _step in range(new_tokens):
            full_logits, _, _ = model(sequence, past_kv=None)
            # highest-scoring token at the last position, kept as a (B, 1) column
            chosen = full_logits[:, -1, :].argmax(dim=-1, keepdim=True)
            sequence = torch.cat((sequence, chosen), dim=1)

def torch_profile(model, device="cpu", B=1, T=128, new_tokens=64):
    """Profile greedy decoding without and with the KV cache and print both tables.

    A random (B, T) prompt is generated once and passed to ``run_no_kv`` and
    then ``run_kv``, each under its own profiler session. For each variant the
    ten entries with the largest self CPU time are printed.

    Args:
        model: callable model exposing ``config.vocab_size``. It is moved to
            ``device`` and switched to eval mode in place.
        device: device to run on (string or ``torch.device``).
        B: batch size of the random prompt.
        T: length of the random prompt in tokens.
        new_tokens: number of decode steps each variant performs.
    """
    model.to(device)
    model.eval()

    idx = torch.randint(0, model.config.vocab_size, (B, T), device=device)

    # Host-side ops are always recorded. On a CUDA device the kernels are
    # recorded as well; with CPU activity alone the table would only show the
    # host-side cost of launching them.
    activities = [ProfilerActivity.CPU]
    if torch.device(device).type == "cuda":
        activities.append(ProfilerActivity.CUDA)

    for name, function in [("no_kv", run_no_kv), ("kv", run_kv)]:
        with profile(activities=activities) as profiler:
            function(model, idx, new_tokens)
        print(f"\n--- {name} ---")
        print(profiler.key_averages().table(sort_by="self_cpu_time_total", row_limit=10))


In [12]:

# Profile the loaded model; the keyword arguments spell out torch_profile's
# default values so the run's settings are visible next to its output.
torch_profile(model, device="cpu", B=1, T=128, new_tokens=64)


--- no_kv ---
-----------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                 Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
-----------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                          aten::addmm        53.23%        5.979s        55.44%        6.227s       2.027ms          3072  
                                             aten::mm        31.03%        3.486s        31.03%        3.486s      54.464ms            64  
    aten::_scaled_dot_product_flash_attention_for_cpu         3.99%     448.060ms         4.49%     504.540ms     656.954us           768  
                                           aten::gelu         3.73%     418.662ms         3.73%     418.662ms     545.133us           768  
     