In [1]:
from transformers import GPT2LMHeadModel

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
model_hf = GPT2LMHeadModel.from_pretrained("gpt2")

In [None]:
sd_hf = model_hf.state_dict()

for k,v in sd_hf.items():
    print(k, v.shape)

In [None]:
sd_hf["transformer.wpe.weight"].view(-1).shape

In [None]:
import matplotlib.pyplot as plt

plt.imshow(sd_hf["transformer.wpe.weight"], cmap="gray")

In [None]:
# sd_hf["transformer.wpe.weight"][:, 150] # --> channel 150 looking across every position

plt.plot(sd_hf["transformer.wpe.weight"][:, 150])
plt.plot(sd_hf["transformer.wpe.weight"][:, 200])
plt.plot(sd_hf["transformer.wpe.weight"][:, 250]) # "This green channel likes to fire for everything up to about position 900

In [82]:
import os
import tiktoken
import math
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F

# ----------------------------------------------------------

class DataLoaderLite:
	def __init__(self, B, T):
		self.B = B
		self.T = T

		# at init load tokens from disk and store them in memory
		with open('input.txt', 'r') as f:
			text = f.read()
		enc = tiktoken.get_encoding('gpt2')
		tokens = enc.encode(text)
		self.tokens = torch.tensor(tokens)
		print(f"loaded {len(self.tokens)} tokens")
		print(f"1 epoch = {len(self.tokens) // (B * T)} batches")

		# state
		self.current_position = 0

	def next_batch(self):
		B, T = self.B, self.T
		buf = self.tokens[self.current_position:self.current_position + B * T + 1]
		x = (buf[:-1]).view(B, T) # inputs
		y = (buf[1:]).view(B, T) # targets

		self.current_position += B * T

		if self.current_position + (B * T + 1) > len(self.tokens):
			self.current_position = 0
		return x, y

class CausalSelfAttention(nn.Module):

	def __init__(self, config):
		super().__init__()
		assert config.n_embd % config.n_head == 0

		self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd) # key, query, value projections for all heads but in a batch. Saves you from three separate instantiations of nn.Linear
		self.c_proj = nn.Linear(config.n_embd, config.n_embd) # output projection

		self.n_head = config.n_head
		self.n_embd = config.n_embd

		self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size)).view(1, 1, config.block_size, config.block_size))

	def forward(self, x):

		B, T, C = x.size() # batch size, sequence length, embedding dimension (n_embd)

		# Calculate query, key, value for all heads in batch, move head forward in the shape to be a batch dim alongside B
		# nh is "number of heads", hs is "head size", and C is number of channels (nh * hs)
		# e.g. in GPT-2 (124M), n_head=12, hs=64, so nh*hs = 768 channels in the Transformer

		qkv = self.c_attn(x)
		q, k, v = qkv.split(self.n_embd, dim=2)
		k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
		q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
		v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)

		# attention materializes the large (T, T) matrix for all queries and keys
		att = q @ k.transpose(-2, -1) * (1.0 / math.sqrt(k.size(-1))) # --> (B, nh, T, T)
		att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
		att = F.softmax(att, dim=-1)

		y = att @ v # (B, nh, T, T) x (B, nh, T, hs) --> (B, nh, T, hs)
		y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side

		# output project
		y = self.c_proj(y)
		return y


class MLP(nn.Module):

	def __init__(self, config):
		super().__init__()
		self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd) # On naming (eg 'c_fc'), we are replicating the GPT2 model
		self.gelu = nn.GELU(approximate='tanh')
		self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)

	def forward(self, x):
		x = self.c_fc(x)
		x = self.gelu(x)
		x = self.c_proj(x)
		return x


class Block(nn.Module):

	def __init__(self, config):
		super().__init__()
		self.ln_1 = nn.LayerNorm(config.n_embd)
		self.attn = CausalSelfAttention(config)
		self.ln_2 = nn.LayerNorm(config.n_embd)
		self.mlp = MLP(config)		

	def forward(self, x):
		x = x + self.attn(self.ln_1(x))
		x = x + self.mlp(self.ln_2(x)) 
		return x



@dataclass
class GPTConfig:
	block_size: int = 1024 # max sequence length
	vocab_size: int = 50257 # number of tokens: 50,000 BPE merges + 256 bytes tokens + 1 <|endoftext|> token
	n_layer: int = 12 # number of layers
	n_head: int = 12 # number of heads
	n_embd: int = 768 # embedding dimension

class GPT(nn.Module):
	
	def __init__(self, config):
		super().__init__()
		self.config = config

		self.transformer = nn.ModuleDict(dict(
			wte = nn.Embedding(config.vocab_size, config.n_embd),
			wpe = nn.Embedding(config.block_size, config.n_embd),
			h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
			ln_f = nn.LayerNorm(config.n_embd)
		))
		self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

		# weight sharing scheme
		self.transformer.wte.weight = self.lm_head.weight

		self.apply(self._init_weights)

	def _init_weights(self, module):
		print(module)


	def forward(self, idx, targets=None):
		# idx is shape (B, T)
		B, T = idx.size()
		assert T <= self.config.block_size, f"Cannot forward sequence of length {T}. Block size is only {self.config.block_size}"
		
		# forward the token and position embeddings
		pos = torch.arange(0, T, dtype=torch.long, device=idx.device) # shape (T)
		pos_emb = self.transformer.wpe(pos) # shape (T, n_embd)
		tok_emb = self.transformer.wte(idx) # shape (B, T, n_embd)
		x = tok_emb + pos_emb
		
		# forward through the blocks of the transformer
		for block in self.transformer.h:
			x = block(x)
		
		# forward the final layernorm and the classifier
		x = self.transformer.ln_f(x)
		logits = self.lm_head(x) # (B, T, vocab_size)

		loss = None
		if targets is not None:
			loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
		return logits, loss


	@classmethod
	def from_pretrained(cls, model_type):
		"""Loads pretrained GPT-2 model weights from huggingface"""
		assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
		from transformers import GPT2LMHeadModel
		print("loading weights from pretrained gpt: %s" % model_type)

		# n_layer, n_head and n_embd are determined from model_type
		config_args = {
			'gpt2':			dict(n_layer=12, n_head=12, n_embd=768), 	# 124M params
			'gpt2-medium':	dict(n_layer=24, n_head=16, n_embd=1024), 	# 350M params
			'gpt2-large':	dict(n_layer=36, n_head=20, n_embd=1280), 	# 774M param
			'gpt2-xl':		dict(n_layer=48, n_head=25, n_embd=1600), 	# 1558M params
		}[model_type]
		config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
		config_args['block_size'] = 1024  # always 1024 for GPT model checkpoints

		# create a from-scratch initialized minGPT model
		config = GPTConfig(**config_args)
		model = GPT(config)
		sd = model.state_dict()
		sd_keys = sd.keys()
		sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # dicard the mask / buffer

		# init a huggingface/transformers model
		model_hf = GPT2LMHeadModel.from_pretrained(model_type)
		sd_hf = model_hf.state_dict()

		# copy while ensuring all of the parameters are aligned and match in names and shapes
		sd_keys_hf = sd_hf.keys()
		sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # dicard the mask / buffer
		sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # dicard the mask / buffer
		transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
		# basically the openai checkppoints use a "Conv1D" module, but we only want to use a vanilla Linear
		# this means we have to transpose these weights when we import them
		assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
		for k in sd_keys_hf:
			if any(k.endswith(w) for w in transposed):
				# special treatment for the Conv1D weights we need to transpose
				assert sd_hf[k].shape[::-1] == sd[k].shape
				with torch.no_grad():
					sd[k].copy_(sd_hf[k].t())
			else:
				assert sd_hf[k].shape == sd[k].shape
				with torch.no_grad():
					sd[k].copy_(sd_hf[k])

		return model

In [None]:
with open("gpt_124.py", "r") as f:
    for i, line in enumerate(f, 1):
        if "\t" in line and " " in line[:len(line) - len(line.lstrip())]:
            print(f"Mixed indentation on line {i}: {repr(line)}")

In [4]:
# attept to autodetect the device
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device = "mps"
print(f"using device: {device}")

num_return_sequences = 5
max_length = 30

# model = GPT.from_pretrained('gpt2')
model = GPT(GPTConfig())
model.eval()
model.to(device)

import tiktoken
enc = tiktoken.get_encoding('gpt2')
tokens = enc.encode("Hello, I'm a language model,")
tokens = torch.tensor(tokens, dtype=torch.long) # (8,)
tokens = tokens.unsqueeze(0).repeat(num_return_sequences, 1) # (5, 8)
x = tokens.to(device)
# print(x)

using device: mps
Embedding(50257, 768)
Embedding(1024, 768)
LayerNorm((768,), eps=1e-05, elementwise_affine=True)
Linear(in_features=768, out_features=2304, bias=True)
Linear(in_features=768, out_features=768, bias=True)
CausalSelfAttention(
  (c_attn): Linear(in_features=768, out_features=2304, bias=True)
  (c_proj): Linear(in_features=768, out_features=768, bias=True)
)
LayerNorm((768,), eps=1e-05, elementwise_affine=True)
Linear(in_features=768, out_features=3072, bias=True)
GELU(approximate='tanh')
Linear(in_features=3072, out_features=768, bias=True)
MLP(
  (c_fc): Linear(in_features=768, out_features=3072, bias=True)
  (gelu): GELU(approximate='tanh')
  (c_proj): Linear(in_features=3072, out_features=768, bias=True)
)
Block(
  (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (attn): CausalSelfAttention(
    (c_attn): Linear(in_features=768, out_features=2304, bias=True)
    (c_proj): Linear(in_features=768, out_features=768, bias=True)
  )
  (ln_2): LayerNorm((768

In [None]:
torch.manual_seed(42)
torch.cuda.manual_seed(42)
while x.size(1) < max_length:
    with torch.no_grad():
        logits = model(x) # (B, T, vocab_size)
        logits = logits[:, -1, :] # (B, vocab_size), take logits at last position
        probs = F.softmax(logits, dim=-1) # get probabilities
        topk_probs, topk_indices = torch.topk(probs, 50, dim=-1) # do top-k sampling of 50 (huggingface pipeline default), topk_probs and topk_indices become (5, 50)
        ix = torch.multinomial(topk_probs, 1) # (B, 1), select a token from top-k probabilities
        xcol = torch.gather(topk_indices, -1, ix) # (B, 1), gather corresponding indices
        x = torch.cat((x, xcol), dim=1) # append to the sequence
        # print(x)



In [None]:
for i in range(num_return_sequences):
    tokens = x[i, :max_length].tolist()
    # print(tokens)
    decoded = enc.decode(tokens)
    print(">", decoded)

In [None]:
with open('input.txt', 'r') as f:
    text = f.read()
data = text[:1000]

tokens = enc.encode(data)
print(tokens)

In [None]:
buf = torch.tensor(tokens[:24+1])
buf = buf.to(device)
x = buf[:-1].view(4, 6)
y = buf[1:].view(4, 6)
x, y

In [None]:
logits = model(x)

In [None]:
# logits.view(-1, logits.size(-1)).shape
logits.size(-1)

In [None]:
buf[:-1]

In [None]:
batch = DataLoaderLite(4, 32)
x, y = batch.next_batch()
x.shape, y.shape

In [None]:
print(sd_hf["lm_head.weight"].shape)
print(sd_hf["transformer.wte.weight"].shape)

In [None]:
print(sd_hf["lm_head.weight"].data_ptr())
print(sd_hf["transformer.wte.weight"].data_ptr())

In [None]:
for module in model.modules():
    if isinstance(module, nn.Linear):
        print(module.weight)

In [None]:
x = torch.zeros(768)
n = 1000
for i in range(n):
    x = x + n**-0.5 * torch.randn(768)
x.std(), x.mean()

In [14]:
params_dict = {pn: p for pn, p in model.named_parameters()}
params_dict.items()

dict_items([('transformer.wte.weight', Parameter containing:
tensor([[-0.0011, -0.0150, -0.0165,  ...,  0.0342, -0.0359,  0.0145],
        [ 0.0035,  0.0245, -0.0040,  ...,  0.0022, -0.0145, -0.0027],
        [-0.0350, -0.0080,  0.0035,  ...,  0.0221,  0.0252, -0.0046],
        ...,
        [-0.0162,  0.0078, -0.0232,  ..., -0.0318,  0.0194,  0.0075],
        [-0.0144,  0.0356, -0.0009,  ...,  0.0071,  0.0067,  0.0221],
        [-0.0197, -0.0053, -0.0341,  ...,  0.0354, -0.0083,  0.0188]],
       device='mps:0', requires_grad=True)), ('transformer.wpe.weight', Parameter containing:
tensor([[-0.2423,  2.9232, -1.8966,  ...,  0.9338,  0.7135, -1.3229],
        [ 0.0465,  1.2547, -0.5670,  ...,  0.9776,  1.0174,  0.6369],
        [ 0.7018, -1.6720,  0.5706,  ..., -0.0042, -0.8983, -1.0521],
        ...,
        [ 0.8860,  2.1170, -0.0767,  ..., -0.2740, -0.7309,  0.2346],
        [-0.7944, -0.9617,  0.7107,  ..., -0.4286,  1.1415,  0.4430],
        [ 0.0820,  0.4727, -0.2112,  ...,  0.312

In [24]:
n=0
for k,v in model.named_parameters():
    print(k)
    n += 1
print(n)

transformer.wte.weight
transformer.wpe.weight
transformer.h.0.ln_1.weight
transformer.h.0.ln_1.bias
transformer.h.0.attn.c_attn.weight
transformer.h.0.attn.c_attn.bias
transformer.h.0.attn.c_proj.weight
transformer.h.0.attn.c_proj.bias
transformer.h.0.ln_2.weight
transformer.h.0.ln_2.bias
transformer.h.0.mlp.c_fc.weight
transformer.h.0.mlp.c_fc.bias
transformer.h.0.mlp.c_proj.weight
transformer.h.0.mlp.c_proj.bias
transformer.h.1.ln_1.weight
transformer.h.1.ln_1.bias
transformer.h.1.attn.c_attn.weight
transformer.h.1.attn.c_attn.bias
transformer.h.1.attn.c_proj.weight
transformer.h.1.attn.c_proj.bias
transformer.h.1.ln_2.weight
transformer.h.1.ln_2.bias
transformer.h.1.mlp.c_fc.weight
transformer.h.1.mlp.c_fc.bias
transformer.h.1.mlp.c_proj.weight
transformer.h.1.mlp.c_proj.bias
transformer.h.2.ln_1.weight
transformer.h.2.ln_1.bias
transformer.h.2.attn.c_attn.weight
transformer.h.2.attn.c_attn.bias
transformer.h.2.attn.c_proj.weight
transformer.h.2.attn.c_proj.bias
transformer.h.2.ln_2

In [31]:
param_dict = {pn: p for pn, p in model.named_parameters()}
param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
decay_params[0]

Parameter containing:
tensor([[-0.0011, -0.0150, -0.0165,  ...,  0.0342, -0.0359,  0.0145],
        [ 0.0035,  0.0245, -0.0040,  ...,  0.0022, -0.0145, -0.0027],
        [-0.0350, -0.0080,  0.0035,  ...,  0.0221,  0.0252, -0.0046],
        ...,
        [-0.0162,  0.0078, -0.0232,  ..., -0.0318,  0.0194,  0.0075],
        [-0.0144,  0.0356, -0.0009,  ...,  0.0071,  0.0067,  0.0221],
        [-0.0197, -0.0053, -0.0341,  ...,  0.0354, -0.0083,  0.0188]],
       device='mps:0', requires_grad=True)

In [100]:
# super simple little MLP
net = torch.nn.Sequential(
    torch.nn.Linear(16, 32),
    torch.nn.GELU(),
    torch.nn.Linear(32, 1)
)
torch.random.manual_seed(42)
x = torch.randn(4, 16)
y = torch.randn(4, 1)
net.zero_grad()
yhat = net(x)
loss = torch.nn.functional.mse_loss(yhat, y)
loss.backward()
print(net[0].weight.grad.view(-1)[:10])

tensor([-0.0150,  0.0011,  0.0042, -0.0040,  0.0059, -0.0080, -0.0078, -0.0138,
        -0.0103, -0.0134])


In [77]:
# now let's do it with grad_accum_steps of 4, and B=1
# the loss objective here is different because 
# accumulation in gradient <---> SUM in loss
# i.e. we instead get:
# L0 = (y[0] - yhat[0])**2
# L1 = (y[1] - yhat[1])**2
# L2 = (y[2] - yhat[2])**2
# L3 = (y[3] - yhat[3])**2
# L = L0 + L1 + L2 + L3
# NOTE: the "normalizer" of 1/4 is lost
net.zero_grad()
for i in range(4):
    yhat = net(x[i])
    loss = torch.nn.functional.mse_loss(yhat, y[i])
    loss /= 4
    loss.backward()
print(net[0].weight.grad.view(-1)[:10])

tensor([-0.0150,  0.0011,  0.0042, -0.0040,  0.0059, -0.0080, -0.0078, -0.0138,
        -0.0103, -0.0134])


In [54]:
params = [p for p in net.parameters()]
params[0].shape

torch.Size([32, 16])

In [70]:
net[0].weight.grad.view(-1)[:10]

tensor([-0.0150,  0.0011,  0.0042, -0.0040,  0.0059, -0.0080, -0.0078, -0.0138,
        -0.0103, -0.0134])

In [79]:
524288 // 32

16384

In [84]:
ddp = int(os.environ.get('RANK', -1)) != -1 
ddp

False

In [99]:
# os.environ.get('RANK', -1)
int(os.environ['PAGER']

ValueError: invalid literal for int() with base 10: 'cat'