In [1]:
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

In [4]:
# Load the pretrained GPT-2 tokenizer; it supplies encode/decode and the vocabulary size used below.
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

hyperparameters

In [14]:
# ---- data ----
seq_len = 8                     # context length: number of positions the model has embeddings for
n_vocab = tokenizer.vocab_size  # vocabulary size reported by the tokenizer

# ---- model ----
embed_dim = 64                  # embedding width (2**6)

# ---- batching ----
batch_size = 5

In [26]:
class Model(nn.Module):
    """Minimal single-head, attention-only language model.

    Pipeline: token + position embeddings -> LayerNorm -> one causal
    self-attention head -> output projection (W0) -> unembedding layer whose
    weight matrix is shared with the token embedding. There is no MLP
    sublayer and no residual connection.

    Args:
        vocab_size: number of tokens in the vocabulary. Defaults to the
            notebook-level ``n_vocab``.
        d_model: embedding width. Defaults to the notebook-level ``embed_dim``.
        context_len: maximum number of positions (context length). Defaults to
            the notebook-level ``seq_len``.
    """

    def __init__(self, vocab_size=None, d_model=None, context_len=None):
        super().__init__()

        # fall back to the notebook hyperparameters so that `Model()` keeps working
        self.n_vocab = n_vocab if vocab_size is None else vocab_size
        self.embed_dim = embed_dim if d_model is None else d_model
        self.seq_len = seq_len if context_len is None else context_len

        # embedding matrices (tokens and positions)
        self.embedding = nn.Embedding(self.n_vocab, self.embed_dim)
        self.positions = nn.Embedding(self.seq_len, self.embed_dim)

        # final output linear layer (unembeddings)
        self.finalLinear = nn.Linear(self.embed_dim, self.n_vocab, bias=False)

        # layernorm plus the k, q, v and output-projection matrices for attention
        self.layernormA = nn.LayerNorm(self.embed_dim)
        self.key = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
        self.query = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
        self.value = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
        self.W0 = nn.Linear(self.embed_dim, self.embed_dim)

        # Tie the output layer to the token embeddings by sharing the *same*
        # Parameter object. Wrapping it in a new nn.Parameter would create a
        # second parameter that merely shares storage (separate .grad,
        # counted twice by parameters()).
        self.finalLinear.weight = self.embedding.weight

    def forward(self, tokx):
        """Compute next-token logits.

        Args:
            tokx: integer token ids of shape [batch, T]; T must not exceed the
                context length, because only that many position embeddings exist.

        Returns:
            A tuple ``(logits, (pastmask, qk_scaled, qk_softmax))`` where
            ``logits`` is [batch, T, n_vocab] and the three attention tensors
            (causal mask, masked scaled scores, attention weights) are each
            [batch, T, T]; they are returned for visualization.
        """
        n_tokens = tokx.shape[-1]

        # create token+position embedding
        token_embed = self.embedding(tokx)  # [batch, T, embed_dim]
        posit_embed = self.positions(
            torch.arange(n_tokens, device=tokx.device)
        )  # [T, embed_dim]

        # their sum is the output of the embeddings (positions broadcast over the batch)
        x = token_embed + posit_embed  # [batch, T, embed_dim]

        # ------ attention sublayer starts here

        # layernorm before attention
        x = self.layernormA(x)

        # attention algo
        k = self.key(x)
        q = self.query(x)
        v = self.value(x)

        qk = q @ k.transpose(-2, -1)  # dot product between queries and keys
        qk_scaled = qk * self.embed_dim ** -0.5

        # Mask out future tokens. The mask is sized from the actual input
        # length so inputs shorter than the context length work as well.
        pastmask = torch.tril(
            torch.ones(x.shape[0], n_tokens, n_tokens, device=x.device)
        )
        qk_scaled = qk_scaled.masked_fill(pastmask == 0, -torch.inf)

        # softmaxify
        qk_softmax = F.softmax(qk_scaled, dim=-1)

        # final attention mechanism
        y = qk_softmax @ v

        # Output projection. This is a plain (out-of-place) linear map: an
        # in-place `y *= W0(y)` would gate y element-wise instead and would
        # overwrite a tensor autograd still needs for the backward pass.
        y = self.W0(y)

        # ------ end attention

        # --o--
        # MLP sublayer would be here
        # --o--

        # y is now shape of [batch, T, embed_dim]

        # final output transformation (unembeddings)
        y = self.finalLinear(y) / np.sqrt(self.embed_dim)
        # now y is [batch, T, n_vocab]

        # also output some attention matrices for visualization
        # (hooks are better in real models to get activations, matrices etc.)
        return y, (pastmask, qk_scaled, qk_softmax)

    def generate(self, tokx, temperature=1, n_new_tokens=50):
        """Autoregressively sample ``n_new_tokens`` tokens after the prompt.

        Args:
            tokx: integer token ids of shape [batch, tokens].
            temperature: divisor applied to the logits before the softmax;
                larger values flatten the sampling distribution.
            n_new_tokens: number of tokens to append.

        Returns:
            Tensor of shape [batch, tokens + n_new_tokens]: the prompt followed
            by the sampled tokens.
        """
        # sampling needs no gradients, so skip building autograd graphs
        with torch.no_grad():
            for _ in range(n_new_tokens):

                # Get predictions, but only from the most recent context-length
                # tokens. Early on that is the prompt; later it is increasingly
                # made of tokens the model itself generated.
                x = self(tokx[:, -self.seq_len:])[0]  # [batch, T, n_vocab]

                # keep the logits at the final position to predict the next token
                x = x[:, -1, :]  # [batch, n_vocab]

                # softmax with temperature -> probabilities over the vocabulary
                probs = F.softmax(x / temperature, dim=-1)

                # probabilistically sample from the distribution
                tokx_next = torch.multinomial(probs, num_samples=1)  # [batch, 1]

                # append
                tokx = torch.cat((tokx, tokx_next), dim=1)  # [batch, tokens+1]
        return tokx

Calculate logits (model output)

In [46]:
# Build one next-token-prediction example from a single sentence:
# inputs are all tokens but the last, targets are the same tokens shifted by one.
tokens = tokenizer.encode('I prefer oat milk in my coffee.')
X = torch.tensor([tokens[:-1]])  # wrapping in a list adds the leading batch dimension
y = torch.tensor([tokens[1:]])

print(X.shape, y.shape, sep='\n')


torch.Size([1, 8])
torch.Size([1, 8])


In [44]:
# display `tokens` (bare last expression -> shown as the cell output)
tokens

[40, 4702, 267, 265, 7545, 287, 616, 6891, 13]

In [45]:
# display the input tensor `X` (bare last expression -> shown as the cell output)
X

tensor([[  40, 4702,  267,  265, 7545,  287,  616, 6891]])

In [47]:
# instantiate the model with its default (randomly initialized) weights
model = Model()
# calling the module runs forward(), not generate(); it returns the logits
# and a tuple of attention tensors (mask, scaled scores, softmax weights)
out, attn = model(X)
# logits shape
print(out.shape)

torch.Size([1, 8, 50257])


In [48]:
# Sanity check on the untrained model: three numbers that should be close to each other.

# 1) negative log-likelihood of a uniform (pure chance) guess over the vocabulary
chance_loss = -np.log(1/tokenizer.vocab_size)
print(f'Expected loss for random weights {chance_loss:.3f}')

# 2) negative log-softmax of the model output, averaged over all entries
mean_neg_logsoftmax = (-F.log_softmax(out.detach(), dim=-1)).mean()
print(f'Observedd mean logsoftmax output: {mean_neg_logsoftmax:.3f}')

# 3) PyTorch cross-entropy between the flattened logits and the flattened targets
flat_logits = out.reshape(-1, out.shape[-1])
flat_targets = y.reshape(-1)
print(f'Cross entropy loss from pytorch  {F.cross_entropy(flat_logits, flat_targets):.3f}')

Expected loss for random weights 10.825
Observedd mean logsoftmax output: 10.829
Cross entropy loss from pytorch  10.855


In [52]:
# Print the three attention tensors returned by the forward pass, each under its own header.
attn_headers = ('time causal mask:\n', '\nqk_scaled\n', '\nqk_softmax\n')
for header, matrix in zip(attn_headers, attn):
    print(header, matrix)


time causal mask:
 tensor([[[1., 0., 0., 0., 0., 0., 0., 0.],
         [1., 1., 0., 0., 0., 0., 0., 0.],
         [1., 1., 1., 0., 0., 0., 0., 0.],
         [1., 1., 1., 1., 0., 0., 0., 0.],
         [1., 1., 1., 1., 1., 0., 0., 0.],
         [1., 1., 1., 1., 1., 1., 0., 0.],
         [1., 1., 1., 1., 1., 1., 1., 0.],
         [1., 1., 1., 1., 1., 1., 1., 1.]]])

qk_scaled
 tensor([[[ 0.1337,    -inf,    -inf,    -inf,    -inf,    -inf,    -inf,
             -inf],
         [ 0.3333, -0.4072,    -inf,    -inf,    -inf,    -inf,    -inf,
             -inf],
         [-0.2466,  0.3072,  0.0410,    -inf,    -inf,    -inf,    -inf,
             -inf],
         [ 0.6310,  0.4456, -0.1337,  0.1302,    -inf,    -inf,    -inf,
             -inf],
         [ 0.1297, -0.0316, -0.3074, -0.0327,  0.4368,    -inf,    -inf,
             -inf],
         [ 0.6467,  0.0744, -0.1543, -0.3090, -0.0118, -0.0961,    -inf,
             -inf],
         [-0.1792, -0.3290, -0.3364,  0.0769, -0.5122, -0.7752, -

generate text

In [55]:
# Encode the prompt; wrapping the id list in another list adds the batch dimension.
text = 'When I grow up, I want to be a'
tokens = torch.tensor([tokenizer.encode(text)])

# Sample 10 new tokens at temperature 2 and keep the only sequence in the batch.
generated_tokens = model.generate(tokens, n_new_tokens=10, temperature=2)[0]

# Decode the ids back to text (bare last expression -> shown as the cell output).
tokenizer.decode(generated_tokens.tolist())


'When I grow up, I want to be a dismantle membr searches prayingalam awardingshirt corrupted gauiona'

In [24]:
a = 10


class A:
    """Scratch check: a method can read a module-level variable without `global`."""

    def __init__(self):
        # Both prints read the module-level `a`. The `a += 5` that the
        # experiment would place between them stays disabled, as in the
        # original cell where it was commented out.
        for label in ("BEFORE,A is: ", "AFTER A is: "):
            print(label, a)

In [25]:
# instantiating A runs __init__, which prints the module-level `a` twice
ob= A()

BEFORE,A is:  10
AFTER A is:  10
