In [2]:
import gpt_tests
# Bare attribute lookup on gpt_tests (nothing is called here).
# NOTE(review): the recorded output of this cell is an AttributeError, i.e. the
# imported gpt_tests module exposes no attribute with this name — confirm the
# intended test name.
gpt_tests.test_unidirectional_attn

AttributeError: module 'gpt_tests' has no attribute 'test_unidirectional_attn'

In [2]:
import torch as t
from typing import *
from torch import einsum
from einops import rearrange, reduce, repeat
import gpt_tests
from torch import nn
from torch.nn import Module
from math import sqrt
import bert_sol

## Making the GPT-2 module

In [3]:
class MultiHeadedAttention(Module):
    """Causal (unidirectional) multi-head self-attention.

    One linear layer produces the queries, keys and values of every head.
    Each position attends only to itself and to earlier positions; the
    per-head results are concatenated and sent through an output projection.

    Args:
        hidden_size: width of the input and output features.
        num_heads: number of attention heads; must divide ``hidden_size``.

    Raises:
        ValueError: if ``hidden_size`` is not a multiple of ``num_heads``.
    """

    def __init__(self, hidden_size, num_heads):
        super().__init__()
        if hidden_size % num_heads != 0:
            # Fail at construction time instead of deep inside forward().
            raise ValueError(
                f"hidden_size ({hidden_size}) must be divisible by num_heads ({num_heads})")
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        # Integer division: the head width is a dimension size, not a float.
        self.head_size = hidden_size // num_heads
        self.attn_lin = nn.Linear(hidden_size, hidden_size*3)
        self.out_lin = nn.Linear(hidden_size, hidden_size)

    def forward(self, x):
        """Apply masked self-attention.

        Args:
            x: tensor indexed as (batch, position, hidden_size).

        Returns:
            Tensor with the same (batch, position, hidden_size) layout.
        """
        seq_len = x.shape[1]
        # Split the fused projection into q/k/v and then into heads.
        qkv = rearrange(self.attn_lin(x), 'b n (qkv h p) -> qkv b h n p', qkv = 3, h = self.num_heads)
        queries, keys, values = qkv[0], qkv[1], qkv[2]
        # attn_score[b, h, f, t]: key at position f scored against query at position t.
        attn_score = t.einsum('bhfp,bhtp -> bhft', keys, queries) / sqrt(self.head_size)

        # future_key[f, t] is True when key position f lies after query position t;
        # those pairs get a large negative score so softmax gives them ~0 weight.
        positions = t.arange(seq_len, device=x.device)
        future_key = positions[:, None] > positions[None, :]
        attn_score = attn_score.masked_fill(future_key, -1e4)

        # Normalise over the key axis (f) separately for every query position (t).
        attn_pattn = t.softmax(attn_score, dim=-2)
        # attn_pattn: b h n n; values: b h n p
        out_by_head = t.einsum('bhft,bhfp->bhtp', attn_pattn, values)
        out = rearrange(out_by_head, 'b h t p -> b t (h p)') # b n hidden_size
        return self.out_lin(out)

In [4]:
# Hand the attention class to the gpt_tests check.
# NOTE(review): the recorded output of this cell is an AttributeError (gpt_tests
# has no attribute with this name) — confirm the test's actual name.
gpt_tests.test_unidirectional_attn(MultiHeadedAttention)

AttributeError: module 'gpt_tests' has no attribute 'test_unidirectional_attn'

In [None]:
class GPT2Block(Module):
    """One pre-LayerNorm transformer block: attention and MLP, each with a residual.

    Args:
        hidden_size: feature width of the residual stream.
        num_heads: number of attention heads.
        dropout: drop probability applied to the MLP output before it is
            added back to the residual stream.
        layer_norm_epsilon: ``eps`` for both LayerNorm layers.
    """

    def __init__(self, hidden_size, num_heads, dropout, layer_norm_epsilon):
        super().__init__()
        self.ln1 = nn.LayerNorm(hidden_size, eps=layer_norm_epsilon)
        self.attn = MultiHeadedAttention(hidden_size, num_heads)
        self.ln2 = nn.LayerNorm(hidden_size, eps=layer_norm_epsilon)
        # Dropout is appended last so the two Linear layers keep their
        # positions (mlp.0 and mlp.2) and therefore their state-dict keys.
        # It has no parameters, so it adds no keys of its own, and it is a
        # no-op in eval mode.
        self.mlp = nn.Sequential(
            nn.Linear(hidden_size, hidden_size*4),
            nn.GELU(),
            nn.Linear(hidden_size*4, hidden_size),
            nn.Dropout(dropout))

    def forward(self, x):
        """Return the block output; same shape as ``x``."""
        # Attention sub-layer: normalise, attend, add the residual.
        x1 = self.ln1(x)
        x2 = self.attn(x1) + x
        # MLP sub-layer: normalise, transform, add the residual.
        x3 = self.ln2(x2)
        return self.mlp(x3) + x2
        

In [None]:
# Hand the block class to the gpt_tests block check.
gpt_tests.test_gpt_block(GPT2Block)

In [None]:
from dataclasses import dataclass
from torchtyping import TensorType


@dataclass
class GPT2Output:
    """Values returned by the model's forward pass."""
    # Annotated as (batch_size, vocab_size).
    logits: TensorType["batch_size", "vocab_size"]
    # Annotated as (batch_size, hidden_size).
    final_encoding: TensorType["batch_size", "hidden_size"]

    
class GPT2(Module):
    """Stack of GPT2Block layers with learned token and position embeddings.

    The token embedding matrix is reused to turn the last position's encoding
    into vocabulary logits.
    """

    def __init__(self, num_layers, num_heads, vocab_size, hidden_size,
                 max_position_embeddings, dropout, layer_norm_epsilon):
        super().__init__()

        # Embedding tables are plain parameters, indexed directly in forward().
        token_table = t.randn(vocab_size, hidden_size)
        position_table = t.randn(max_position_embeddings, hidden_size)
        self.token_embedding = nn.Parameter(token_table)
        self.pos_embedding = nn.Parameter(position_table)

        self.dropout = nn.Dropout(dropout)

        # num_layers blocks followed by the final LayerNorm, all in one Sequential.
        layers = []
        for _ in range(num_layers):
            layers.append(GPT2Block(hidden_size, num_heads, dropout, layer_norm_epsilon))
        layers.append(nn.LayerNorm(hidden_size, eps=layer_norm_epsilon))
        self.blocks = nn.Sequential(*layers)

    def forward(self, x):
        """Map token ids ``x`` (indexed batch, position) to a GPT2Output for the last position."""
        num_positions = x.shape[1]
        token_part = self.token_embedding[x]
        position_part = self.pos_embedding[:num_positions]
        hidden = self.dropout(token_part + position_part)

        # Keep only the encoding of the final position of every sequence.
        last_position = self.blocks(hidden)[:, -1]

        # Score each vocabulary entry against that encoding.
        vocab_scores = t.einsum('vc,bc->bv', self.token_embedding, last_position)

        return GPT2Output(logits=vocab_scores, final_encoding=last_position)
    
    
    
    

In [None]:
# Hand the full model class to the gpt_tests model check.
gpt_tests.test_gpt(GPT2)

## Loading pretrained weights

In [None]:
# Build a randomly initialised GPT2 with the hyperparameters below, and fetch
# the reference model that gpt_tests provides.
my_gpt = GPT2(num_layers=12, num_heads=12, vocab_size=50257, hidden_size=768,
                 max_position_embeddings=1024, dropout=0.1, layer_norm_epsilon=1e-5)
pretrained_gpt = gpt_tests.get_pretrained_gpt()

In [None]:
# Rebinds my_gpt to a new GPT2 built with the same arguments as the previous
# cell; the earlier instance is dropped and the embeddings are drawn afresh.
my_gpt = GPT2(num_layers=12, num_heads=12, vocab_size=50257, hidden_size=768,
                 max_position_embeddings=1024, dropout=0.1, layer_norm_epsilon=1e-5)

In [None]:
def string_replace(s, num_layers=12):
    """Translate a pretrained state-dict key into this notebook's parameter naming.

    Args:
        s: a key from the pretrained model's state dict.
        num_layers: number of transformer blocks in the target model. The
            final LayerNorm sits right after them inside ``blocks``, so its
            index equals ``num_layers``. Defaults to 12.

    Returns:
        The key renamed as follows:
        ``*embedding.weight`` -> ``*embedding`` (embeddings are bare parameters),
        ``linear1`` / ``linear2`` -> ``mlp.0`` / ``mlp.2``,
        leading ``ln.`` -> ``blocks.<num_layers>.``.
    """
    s = s.replace("embedding.weight", "embedding")
    s = s.replace("linear1", "mlp.0")
    s = s.replace("linear2", "mlp.2")
    # Only a key that *starts* with "ln." is the final LayerNorm; matching the
    # prefix keeps any other name that merely contains "ln." untouched.
    final_ln_prefix = "ln."
    if s.startswith(final_ln_prefix):
        s = f"blocks.{num_layers}." + s[len(final_ln_prefix):]
    return s

# Rename every entry of the pretrained state dict in place so that its keys
# follow this notebook's parameter names. The keys are snapshotted first
# because the dict is modified while looping.
their_dict = pretrained_gpt.state_dict()
for k in tuple(their_dict.keys()):
    value = their_dict.pop(k)
    their_dict[string_replace(k)] = value

In [None]:
# Load the renamed pretrained tensors into my_gpt.
my_gpt.load_state_dict(their_dict)

## Efficient text generation

In [None]:
import transformers
%env TOKENIZERS_PARALLELISM=false

# Load the "gpt2" tokenizer from transformers; later cells use it both to
# encode prompts and to decode generated token ids.
tokenizer = transformers.GPT2Tokenizer.from_pretrained("gpt2")
# print(tokenizer(['Hello, I am a sentence.']))


In [None]:
def feed_gpt(model: nn.Module, text: str, tokenizer, top_k: int = 10):
    """Print the ``top_k`` most likely next tokens for ``text`` with their probabilities.

    Args:
        model: module whose call returns an object with a ``logits``
            attribute indexed as (batch, vocab).
        text: prompt to tokenize and feed to the model.
        tokenizer: callable returning a mapping with ``"input_ids"``; it must
            also provide ``decode``.
        top_k: how many of the highest-scoring tokens to show.

    Returns:
        None. Two lines are printed: the decoded top tokens, then their
        probabilities.

    The model is run as-is: its train/eval mode is not changed, and the input
    tensor is created on the default device.
    """
    input_ids: List[int] = tokenizer(text)["input_ids"]
    # Pure inference: no autograd graph is needed, and the printed
    # probabilities should be plain values without a grad_fn.
    with t.no_grad():
        logits = model(t.tensor([input_ids], dtype=t.long)).logits
        probs = t.softmax(logits, dim=-1)
    top_logit_idxs = t.argsort(logits, descending=True)[0,:top_k]
    top_logit_words = tokenizer.decode(top_logit_idxs)
    print(top_logit_words)
    print(probs[0,top_logit_idxs])

In [None]:
# feed_gpt builds its input tensor without specifying a device, so move the
# reference model to the CPU before querying it with the prompt.
pretrained_gpt.cpu()
feed_gpt(pretrained_gpt, "Students at the machine learning bootcamp really enjoyed the", tokenizer)

In [None]:
def feed_gpt_top(model: nn.Module, input_ids: List[int], top_k: int = 10):
    """Return the id of the single highest-scoring next token (greedy choice).

    Args:
        model: module whose call returns an object with a ``logits``
            attribute indexed as (batch, vocab).
        input_ids: token ids of the prompt so far.
        top_k: not used; kept so existing calls that pass it keep working.

    Returns:
        A 0-dimensional integer tensor holding the chosen token id.
    """
    # Inference only, so skip building an autograd graph for the forward pass.
    with t.no_grad():
        logits = model(t.tensor([input_ids], dtype=t.long)).logits
    # Only the best token is needed: argmax avoids sorting the whole
    # vocabulary, and no softmax is required to find the largest logit.
    return logits.argmax(dim=-1)[0]

In [None]:
# Put my_gpt in evaluation mode before generating; the trailing ';' keeps the
# cell from echoing the returned module.
my_gpt.eval();

In [None]:
# Greedy generation: repeatedly ask my_gpt for the most likely next token and
# append it to the prompt.
start_str = "The machine learning bootcamp started out nicely. But soon, I got an ominous feeling. Shockingly, I discovered"
input_ids = tokenizer(start_str)["input_ids"]
for i in range(100):
    # Store a plain int so input_ids stays a list of ints, as it was when the
    # tokenizer produced it.
    new_token = int(feed_gpt_top(my_gpt, input_ids, 1))
    # GPT-2 tokens already include their own leading whitespace, so print them
    # back to back; a separator would split words made of several tokens.
    print(tokenizer.decode(new_token), end="")
    input_ids.append(new_token)

In [None]:
class GPT2Modified(Module):
    """GPT-2 model whose forward pass also keeps the full-sequence encoding.

    After each call, ``self.encoding`` holds the output of ``self.blocks`` for
    every position, not just the last one.
    """

    def __init__(self, num_layers, num_heads, vocab_size, hidden_size,
                 max_position_embeddings, dropout, layer_norm_epsilon):
        super().__init__()
        # Learned lookup tables, stored as bare parameters.
        self.token_embedding = nn.Parameter(t.randn(vocab_size, hidden_size))
        self.pos_embedding = nn.Parameter(t.randn(max_position_embeddings, hidden_size))
        self.dropout = nn.Dropout(dropout)
        # Transformer blocks first, final LayerNorm last.
        stack = [
            GPT2Block(hidden_size, num_heads, dropout, layer_norm_epsilon)
            for _ in range(num_layers)
        ]
        stack.append(nn.LayerNorm(hidden_size, eps=layer_norm_epsilon))
        self.blocks = nn.Sequential(*stack)

    def forward(self, x):
        """Run the model on token ids ``x`` (indexed batch, position)."""
        n_positions = x.shape[1]
        summed = self.token_embedding[x] + self.pos_embedding[:n_positions]
        all_positions = self.blocks(self.dropout(summed))
        # Expose the encoding of every position to the caller.
        self.encoding = all_positions
        last_position = all_positions[:, -1]
        vocab_scores = t.einsum('vc,bc->bv', self.token_embedding, last_position)
        return GPT2Output(logits=vocab_scores, final_encoding=last_position)

In [None]:
# Build the encoding-caching variant with the same hyperparameters used for my_gpt.
my_gpt_modified = GPT2Modified(num_layers=12, num_heads=12, vocab_size=50257, hidden_size=768,
                 max_position_embeddings=1024, dropout=0.1, layer_norm_epsilon=1e-5)

In [None]:
# Load the renamed pretrained tensors into the modified model as well.
my_gpt_modified.load_state_dict(their_dict)

In [None]:
def create_padded_thing():
    """Build a small batch of padded prefixes of one tokenized sentence.

    Prefixes of up to 6, 7, 8 and 9 tokens are taken from the sentence, and
    each is right-padded with token id 0 to exactly 10 entries, so the rows
    always form a rectangular batch.

    Returns:
        A list of four lists of ints, each of length 10.
    """
    s = 'My life motto: Fortune favors the bold'
    ids = tokenizer(s)["input_ids"]
    thing = []
    for i in range(6, 10):
        prefix = ids[:i]
        # Pad from the prefix's real length: the slice is shorter than i when
        # the sentence has fewer than i tokens.
        thing.append(prefix + [0] * (10 - len(prefix)))
    # Sanity check: every row should report length 10.
    print([len(row) for row in thing])
    return thing

# The batch is 2-D, so decode it row by row with batch_decode.
print(tokenizer.batch_decode(t.tensor(create_padded_thing())))