In [1]:
import torch
import torch.nn as nn
import math
import torch.nn.functional as F
from dataclasses import dataclass

In [2]:
# Pick the compute device once: use the GPU when PyTorch can see one,
# otherwise fall back to the CPU. The chosen name is echoed for the reader.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

cuda


In [3]:
class Block(nn.Module):
    """One transformer layer: self-attention followed by an MLP.

    Each sub-layer is applied to a LayerNorm-ed copy of its input and the
    result is added back onto that input (residual connection).
    """

    def __init__(self, config):
        """Create the two LayerNorms, the attention module and the MLP.

        Args:
            config: object exposing ``n_embd`` (and whatever
                ``CausalSelfAttention`` and ``MLP`` read from it).
        """
        super().__init__()
        width = config.n_embd
        self.ln_1 = nn.LayerNorm(width)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(width)
        self.mlp = MLP(config)

    def forward(self, x):
        """Return ``x`` after the attention and MLP residual updates."""
        after_attn = x + self.attn(self.ln_1(x))
        return after_attn + self.mlp(self.ln_2(after_attn))
    
class MLP(nn.Module):
    """Feed-forward sub-layer: widen to 4x ``n_embd``, tanh-GELU, project back."""

    def __init__(self, config):
        """Create the two linear layers and the activation.

        Args:
            config: object exposing ``n_embd``, the input/output width.
        """
        super().__init__()
        width = config.n_embd
        hidden = 4 * width
        self.c_fc = nn.Linear(width, hidden)
        self.gelu = nn.GELU(approximate='tanh')
        self.c_proj = nn.Linear(hidden, width)

    def forward(self, x):
        """Apply ``c_fc`` -> GELU -> ``c_proj`` to ``x`` and return the result."""
        return self.c_proj(self.gelu(self.c_fc(x)))
    
class CausalSelfAttention(nn.Module):
    """Multi-head self-attention in which a position only attends to itself and earlier positions."""

    def __init__(self, config):
        """Create the fused q/k/v projection, the output projection and the mask.

        Args:
            config: object exposing ``n_embd``, ``n_head`` and ``block_size``;
                ``n_embd`` must be divisible by ``n_head``.
        """
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # A single linear layer emits q, k and v side by side along the last axis.
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.n_head = config.n_head
        self.n_embd = config.n_embd

        # Lower-triangular matrix of ones, shaped (1, 1, block_size, block_size).
        # It is a mask, not a learnable bias, despite the buffer name "bias".
        ctx = config.block_size
        causal_mask = torch.ones(ctx, ctx).tril().view(1, 1, ctx, ctx)
        self.register_buffer("bias", causal_mask)

    def forward(self, x):
        """Return the attention output for ``x`` of size (B, T, C), same size as ``x``."""
        B, T, C = x.size()
        head_dim = C // self.n_head

        def to_heads(t):
            # (B, T, C) -> (B, n_head, T, head_dim)
            return t.view(B, T, self.n_head, head_dim).transpose(1, 2)

        q, k, v = (to_heads(part) for part in self.c_attn(x).split(self.n_embd, dim=2))

        # Scaled dot-product scores, shape (B, n_head, T, T).
        scale = 1.0 / math.sqrt(head_dim)
        scores = (q @ k.transpose(-2, -1)) * scale

        # Entries where the mask is 0 (column index > row index) get -inf so
        # that softmax assigns them zero weight.
        blocked = self.bias[:, :, :T, :T] == 0  # type: ignore
        scores = scores.masked_fill(blocked, float('-inf'))
        weights = F.softmax(scores, dim=-1)

        # Weighted sum of values, then merge the heads back into (B, T, C).
        merged = (weights @ v).transpose(1, 2).contiguous().view(B, T, C)
        return self.c_proj(merged)


In [4]:
@dataclass
class GPT2Config:
    """Model hyperparameters; the defaults equal the 'gpt2' preset used by ``GPT.from_pretrained``."""

    block_size: int = 1024    # rows in the position-embedding table / side of the causal mask
    vocab_size: int = 50257   # rows in the token-embedding table and width of the logits
    n_layer: int = 12         # number of stacked Block modules
    n_head: int = 12          # attention heads per block (must divide n_embd)
    n_embd: int = 768         # embedding / hidden width


class GPT(nn.Module):
    """GPT-2 style language model.

    Token and position embeddings are summed, run through ``config.n_layer``
    ``Block`` modules and a final LayerNorm, then projected to vocabulary
    logits by ``lm_head``.
    """

    def __init__(self, config):
        """Build the module tree.

        Args:
            config: object exposing ``vocab_size``, ``block_size``,
                ``n_layer`` and ``n_embd`` (plus whatever ``Block`` reads),
                e.g. a ``GPT2Config``.
        """
        super().__init__()
        self.config = config

        # from_pretrained copies tensors by key, so these submodule names
        # must match the key names of the checkpoint's state dict.
        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd),
        ))

        # NOTE(review): lm_head is a separate parameter from transformer.wte
        # here; from_pretrained fills both from the checkpoint. Presumably the
        # two should share one tensor as in GPT-2 -- confirm before training.
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

    @classmethod
    def from_pretrained(cls, model_type):
        """Load pretrained GPT-2 weights from Hugging Face into a new model.

        Args:
            model_type: one of 'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'.

        Returns:
            An instance of ``cls`` whose parameters hold the checkpoint values.

        Raises:
            AssertionError: if ``model_type`` is unknown, or the checkpoint's
                keys or tensor shapes do not line up with this model.
        """
        presets = {
            'gpt2': dict(n_layer=12, n_head=12, n_embd=768),
            'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024),
            'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280),
            'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600),
        }
        assert model_type in presets, f"unknown model_type: {model_type!r}"
        from transformers import GPT2LMHeadModel

        print(f"loading weights from pretrained gpt: {model_type}")

        # Vocabulary and context size are the same for every preset.
        config_args = dict(presets[model_type], vocab_size=50257, block_size=1024)
        config = GPT2Config(**config_args)
        # Instantiate through cls so that subclasses get their own type back.
        model = cls(config)

        # Our causal-mask buffer ('.attn.bias') is not a weight; skip it.
        sd = model.state_dict()
        sd_keys = [k for k in sd if not k.endswith('.attn.bias')]

        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()
        # Likewise drop the checkpoint's mask buffers if it carries any.
        sd_keys_hf = [
            k for k in sd_hf
            if not k.endswith(('.attn.masked_bias', '.attn.bias'))
        ]

        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} vs {len(sd_keys)}"
        # Equal counts do not guarantee equal names; report offenders by name
        # instead of failing later with a bare KeyError.
        unknown = sorted(set(sd_keys_hf) - set(sd_keys))
        assert not unknown, f"checkpoint keys with no counterpart in the model: {unknown}"

        # These checkpoint weights have the reverse shape of the matching
        # nn.Linear weight here, so they are transposed while copying.
        transposed = ('attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight')

        with torch.no_grad():
            for k in sd_keys_hf:
                src, dst = sd_hf[k], sd[k]
                if k.endswith(transposed):
                    assert src.shape[::-1] == dst.shape, f"shape mismatch for {k}: {tuple(src.shape)} vs {tuple(dst.shape)}"
                    dst.copy_(src.t())
                else:
                    assert src.shape == dst.shape, f"shape mismatch for {k}: {tuple(src.shape)} vs {tuple(dst.shape)}"
                    dst.copy_(src)

        return model

    def forward(self, x):
        """Compute next-token logits.

        Args:
            x: 2-D tensor of token ids, size (batch, seq_len), with
                ``seq_len <= config.block_size``.

        Returns:
            The output of ``lm_head`` for every position; the last dimension
            has size ``vocab_size``.

        Raises:
            AssertionError: if the sequence is longer than ``block_size``.
        """
        _, seq_len = x.size()

        assert seq_len <= self.config.block_size, (
            f"can't forward: sequence length {seq_len} exceeds block_size {self.config.block_size}"
        )
        # Position ids 0..seq_len-1; their embeddings broadcast over the batch.
        pos = torch.arange(0, seq_len, dtype=torch.long, device=x.device)
        pos_emb = self.transformer.wpe(pos) # type: ignore
        tok_emb = self.transformer.wte(x) # pyright: ignore[reportCallIssue]
        x = tok_emb + pos_emb
        for block in self.transformer.h: # type: ignore
            x = block(x)
        x = self.transformer.ln_f(x) # type: ignore
        logits = self.lm_head(x)
        return logits


In [5]:
# Sampling configuration.
num_return_seq = 5   # how many continuations to sample in parallel
max_length = 100     # total tokens per sequence, prompt included
top_k = 50           # sample only among the k most probable next tokens


# Load the pretrained weights and move the model to `device`, which the setup
# cell near the top of the notebook selects, so this also runs without a GPU.
model = GPT.from_pretrained('gpt2')
model.eval()
model.to(device)

import tiktoken
enc = tiktoken.get_encoding('gpt2')
tokens = enc.encode("The meaning of life")
tokens = torch.tensor(tokens, dtype=torch.long)
# Same prompt repeated once per requested sample: (num_return_seq, prompt_len).
tokens = tokens.unsqueeze(0).repeat(num_return_seq, 1)
x = tokens.to(device)

# Fixed seeds make the sampled text repeatable; the CUDA seed call is
# ignored by PyTorch when no GPU is available.
torch.manual_seed(42)
torch.cuda.manual_seed(42)
with torch.no_grad():
    while x.size(1) < max_length:
        logits = model(x)
        # Only the prediction at the last position is needed.
        logits = logits[:, -1, :]
        probs = F.softmax(logits, dim=-1)
        topk_probs, topk_indices = torch.topk(probs, top_k, dim=-1)
        # multinomial draws an index into the top-k list; gather maps it
        # back to the actual vocabulary id.
        ix = torch.multinomial(topk_probs, 1)
        xcol = torch.gather(topk_indices, -1, ix)
        x = torch.cat((x, xcol), dim=1)


# Decode and print each sampled sequence.
for i in range(num_return_seq):
    tokens = x[i, :max_length].tolist()
    decoded = enc.decode(tokens)
    print(">", decoded)

loading weights from pretrained gpt: gpt2


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

> The meaning of life is not so much that its members, those who are on it; but how they are united to one body. This is the most important question. For for the human race it is more important to keep alive an organism than to allow life to pass through it. As a result, every organism needs the presence of other organisms. In that sense, the organism in order to thrive is to die and must survive on itself and not be replaced by another organism—if there is other
> The meaning of life or other fundamental things in the universe?"

- An example of this, in a series of events from the Old Testament, which is referenced as scripture for the first time:

"[L]ew that it be a matter of life that I should have, and I should have no other means of obtaining it"

- This refers to the fact that human beings want to possess every life they can.

- "A number of the prophets spoke of that
> The meaning of life in the Jewish religion is eternal and unchanging" (Dukhulam 5:20).

In their view, the 

In [8]:
# Print the shapes of the output-projection weight and the token-embedding
# weight, one per line. Note: despite its name, `sd_hf` holds the state dict
# of the local `model`, not of the Hugging Face model.
sd_hf = model.state_dict()
print(
    sd_hf["lm_head.weight"].shape,
    sd_hf["transformer.wte.weight"].shape,
    sep="\n",
)

torch.Size([50257, 768])
torch.Size([50257, 768])
