In [65]:
# Print the installed TensorFlow version as an environment sanity check.
# NOTE(review): TensorFlow is not referenced by any other visible cell — presumably it is
# required by the checkpoint-loading helper imported later; confirm before removing.
import tensorflow as tf
print(tf.__version__)

2.18.0


In [66]:
import torch
import torch.nn as nn

In [67]:
#GELU activation function (tanh approximation)
class GELU(nn.Module):
    """Gaussian Error Linear Unit, computed with the tanh approximation.

    Evaluates ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``
    element-wise. The module has no parameters.
    """
    def __init__(self,):
        super().__init__()
    def forward(self, x):
        """Return the activation of ``x``; the result has the same shape as ``x``."""
        # sqrt(2/pi) as a plain Python float: no helper tensor is allocated on
        # every call and the constant follows the dtype/device of ``x``.
        scale = (2.0 / torch.pi) ** 0.5
        # 0.044715 is the cubic coefficient of the standard tanh approximation
        # (the previous value 0.0447 was a truncated constant).
        return 0.5*x*(1+torch.tanh(scale*(x+0.044715*torch.pow(x,3))))

In [68]:
#Position-wise feed-forward network
class FeedForward(nn.Module):
    """Two-layer MLP: expand ``emb_dim`` four-fold, apply GELU, project back.

    Args:
        cfg: configuration mapping; only ``cfg['emb_dim']`` is read.
    """
    def __init__(self, cfg):
        super().__init__()
        width = cfg['emb_dim']
        hidden_width = 4 * width
        expand = nn.Linear(width, hidden_width)      # widen the representation
        contract = nn.Linear(hidden_width, width)    # return to the original width
        self.layers = nn.Sequential(expand, GELU(), contract)

    def forward(self,x):
        """Run ``x`` through the expand -> GELU -> contract stack."""
        out = self.layers(x)
        return out

In [69]:
# Layer normalisation over the last dimension
class LayerNorm(nn.Module):
    """Normalise the last dimension to zero mean / unit variance, then rescale.

    Args:
        emd_dim: size of the last dimension; sets the length of the learnable
            ``scale`` (initialised to ones) and ``shift`` (initialised to zeros).
    """
    def __init__(self, emd_dim):
        super().__init__()
        self.eps = 1e-5  # added to the variance before the square root
        self.scale = nn.Parameter(torch.ones(emd_dim))
        self.shift = nn.Parameter(torch.zeros(emd_dim))

    def forward(self, x):
        """Return ``scale * (x - mean) / sqrt(var + eps) + shift``."""
        # Statistics are taken along the last axis; the variance is the biased
        # (population) estimate because ``unbiased=False``.
        centre = x.mean(dim=-1, keepdim=True)
        spread = x.var(dim=-1, keepdim=True, unbiased=False)
        denom = torch.sqrt(spread + self.eps)
        normalised = (x - centre) / denom
        return self.scale * normalised + self.shift

In [70]:
class MultiHeadAttention(nn.Module):
    """Causal multi-head self-attention.

    Args:
        d_in: size of the last dimension of the input.
        d_out: total output width; split evenly across the heads.
        context_length: side length of the pre-built upper-triangular mask,
            i.e. the largest number of tokens the mask can cover.
        dropout: dropout probability applied to the attention weights.
        num_heads: number of heads; must divide ``d_out``.
        qkv_bias: whether the query/key/value projections have a bias term.
    """
    def __init__(self,d_in,d_out,context_length,dropout,num_heads,qkv_bias = False):
        super().__init__()
        assert(d_out % num_heads==0),\
            "d_out must be divisible by nums head"

        self.d_out = d_out
        self.num_head = num_heads
        self.head_dim = d_out//num_heads

        # Input projections, created in the order query, key, value.
        self.w_query = nn.Linear(d_in,d_out, bias=qkv_bias)
        self.w_key = nn.Linear(d_in,d_out, bias=qkv_bias)
        self.w_value = nn.Linear(d_in,d_out, bias=qkv_bias)

        # Output projection that mixes the concatenated head outputs.
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)

        # Ones strictly above the diagonal mark the positions to hide.
        future_positions = torch.ones(context_length, context_length).triu(diagonal=1)
        self.register_buffer('mask', future_positions)

    def _split_heads(self, projected, batch, tokens):
        """Reshape (batch, tokens, d_out) to (batch, num_head, tokens, head_dim)."""
        per_head = projected.view(batch, tokens, self.num_head, self.head_dim)
        return per_head.transpose(1, 2)

    def forward(self,x):
        """Return the attention output of shape (batch, tokens, d_out) for a 3-D ``x``."""
        batch, tokens, _ = x.shape

        keys = self._split_heads(self.w_key(x), batch, tokens)
        queries = self._split_heads(self.w_query(x), batch, tokens)
        values = self._split_heads(self.w_value(x), batch, tokens)

        # Raw scores: (batch, num_head, tokens, tokens).
        scores = torch.matmul(queries, keys.transpose(2, 3))

        # Hide later positions, then scale by sqrt(head_dim) and normalise.
        hidden = self.mask.bool()[:tokens, :tokens]
        scores.masked_fill_(hidden, -torch.inf)
        weights = torch.softmax(scores / keys.shape[-1] ** 0.5, dim=-1)
        weights = self.dropout(weights)

        # Weighted sum of values, heads moved back next to the feature axis.
        context = torch.matmul(weights, values).transpose(1, 2)

        # Concatenate the heads and apply the output projection.
        merged = context.contiguous().view(batch, tokens, self.d_out)
        return self.out_proj(merged)

In [71]:
class TransformerBlock(nn.Module):
    """Pre-norm transformer block: attention and feed-forward, each with a shortcut.

    Args:
        cfg: configuration mapping; reads ``emb_dim``, ``context_length``,
            ``n_heads``, ``drop_rate`` and ``qkv_bias`` (``FeedForward`` also
            receives the whole mapping).
    """
    def __init__(self,cfg):
        super().__init__()
        width = cfg['emb_dim']
        self.attn = MultiHeadAttention(
            d_in=width,
            d_out=width,
            context_length=cfg['context_length'],
            num_heads=cfg['n_heads'],
            dropout=cfg['drop_rate'],
            qkv_bias=cfg['qkv_bias'],
        )
        self.ff = FeedForward(cfg=cfg)
        self.norm1 = LayerNorm(emd_dim=width)
        self.norm2 = LayerNorm(emd_dim=width)

        # One dropout module shared by both shortcut branches.
        self.drop_shortcut  = nn.Dropout(cfg['drop_rate'])

    def forward(self,x):
        """Apply norm -> attention -> dropout -> add, then norm -> MLP -> dropout -> add."""
        # Attention sub-layer with its residual connection.
        attended = self.drop_shortcut(self.attn(self.norm1(x)))
        x = attended + x

        # Feed-forward sub-layer with its residual connection.
        transformed = self.drop_shortcut(self.ff(self.norm2(x)))
        return transformed + x


In [72]:
class GPTModel(nn.Module):
    """GPT-style decoder: embeddings, a stack of transformer blocks, final norm, vocab head.

    Args:
        cfg: configuration mapping; reads ``vocab_size``, ``emb_dim``,
            ``context_length``, ``drop_rate`` and ``n_layers`` (each
            ``TransformerBlock`` also receives the whole mapping).
    """
    def __init__(self,cfg):
        super().__init__()
        vocab_size = cfg['vocab_size']
        width = cfg['emb_dim']

        self.tok_emb = nn.Embedding(vocab_size, width)
        self.pos_emb = nn.Embedding(cfg['context_length'], width)
        self.drop_emb = nn.Dropout(cfg['drop_rate'])

        blocks = [TransformerBlock(cfg) for _ in range(cfg['n_layers'])]
        self.trf_blocks = nn.Sequential(*blocks)

        self.final_norm  = LayerNorm(width)
        self.out_head = nn.Linear(width, vocab_size, bias=False)

    def forward(self,in_idx):
        """Map token ids of shape (batch, seq_len) to logits of shape (batch, seq_len, vocab_size)."""
        _, seq_len = in_idx.shape

        # Positions 0..seq_len-1, created on the same device as the input ids.
        positions = torch.arange(seq_len, device=in_idx.device)
        hidden = self.tok_emb(in_idx) + self.pos_emb(positions)

        hidden = self.drop_emb(hidden)
        hidden = self.trf_blocks(hidden)
        hidden = self.final_norm(hidden)
        return self.out_head(hidden)



In [73]:
# Base GPT-2 (124M) hyperparameters; this mapping is the ``cfg`` argument consumed by
# GPTModel, TransformerBlock and FeedForward.
GPT_CONFIG_124M= {
    'vocab_size' :50257,    # rows of the token embedding and width of the output head
    'context_length': 256,  # rows of the positional embedding and side of the causal mask
    'emb_dim':768,          # embedding width used throughout the model
    'n_heads':12,           # attention heads per transformer block
    'n_layers':12,          # number of stacked transformer blocks
    'drop_rate':0.1,        # dropout probability (embeddings, attention weights, shortcuts)
    'qkv_bias':False        # whether the query/key/value projections have a bias
}

In [74]:
from zz3_weightDownload import download_and_load_gpt2

In [75]:
from zz3_weightDownload import download_and_load_gpt2

In [76]:
# Obtain the 124M GPT-2 checkpoint via the project helper, using "gpt_2" as the models
# directory. The helper returns a (settings, params) pair that the next cells inspect.
# NOTE(review): the helper's download/caching behaviour is defined in zz3_weightDownload,
# not here — the recorded output suggests existing files are reused; confirm.
settings, params = download_and_load_gpt2(model_size="124M", models_dir="gpt_2")



File already exists and is up to date




File already exists and is up to date




File already exists and is up to date




File already exists and is up to date




File already exists and is up to date




File already exists and is up to date




File already exists and is up to date


In [77]:
# Show the checkpoint's hyperparameters and the top-level keys of the weight dictionary.
print("Settings:", settings)
print("Parameters dictionry keys :", params.keys())

Settings: {'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 768, 'n_head': 12, 'n_layer': 12}
Parameters dictionry keys : dict_keys(['blocks', 'b', 'g', 'wpe', 'wte'])


In [78]:
# Peek at the token-embedding matrix ('wte') and confirm its shape
# (the recorded output shows (50257, 768)).
print(params['wte'])
print(params['wte'].shape)

[[-0.11010301 -0.03926672  0.03310751 ... -0.1363697   0.01506208
   0.04531523]
 [ 0.04034033 -0.04861503  0.04624869 ...  0.08605453  0.00253983
   0.04318958]
 [-0.12746179  0.04793796  0.18410145 ...  0.08991534 -0.12972379
  -0.08785918]
 ...
 [-0.04453601 -0.05483596  0.01225674 ...  0.10435229  0.09783269
  -0.06952604]
 [ 0.1860082   0.01665728  0.04611587 ... -0.09625227  0.07847701
  -0.02245961]
 [ 0.05135201 -0.02768905  0.0499369  ...  0.00704835  0.15519823
   0.12067825]]
(50257, 768)


In [79]:
# Architecture overrides for each released GPT-2 size; one entry is merged into the
# base configuration to select a model.
# emb_dim must be divisible by n_heads (MultiHeadAttention asserts this), so each size
# carries its own embedding width: 768 / 1024 / 1280 / 1600.
# The large and XL labels previously repeated "(124M)"; they now carry their real sizes.
model_config = {
    "gpt-2-small(124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt-2-medium(355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt-2-large(774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt-2-xl(1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}

In [80]:
# Build the working configuration: start from a shallow copy of the base config (so
# GPT_CONFIG_124M itself is left untouched) and overlay the chosen model's
# architecture settings.
model_name = "gpt-2-small(124M)"
NEW_CONFIG = GPT_CONFIG_124M.copy()
NEW_CONFIG.update(model_config[model_name])

In [81]:
NEW_CONFIG

{'vocab_size': 50257,
 'context_length': 256,
 'emb_dim': 768,
 'n_heads': 12,
 'n_layers': 12,
 'drop_rate': 0.1,
 'qkv_bias': False}

In [82]:
# Match the pretrained checkpoint: its settings report n_ctx = 1024, and the weight
# loader copies bias vectors into the query/key/value projections, so qkv_bias must be on.
NEW_CONFIG.update({"context_length":1024, "qkv_bias":True})

In [83]:
NEW_CONFIG

{'vocab_size': 50257,
 'context_length': 1024,
 'emb_dim': 768,
 'n_heads': 12,
 'n_layers': 12,
 'drop_rate': 0.1,
 'qkv_bias': True}

In [84]:
# Instantiate the model with freshly initialised weights; the pretrained values are
# copied in by a later cell.
myGpt_2 = GPTModel(NEW_CONFIG)

In [85]:
# Switch to evaluation mode so the nn.Dropout layers are inactive during generation.
# As the last expression of the cell, this also displays the module tree.
myGpt_2.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (attn): MultiHeadAttention(
        (w_query): Linear(in_features=768, out_features=768, bias=True)
        (w_key): Linear(in_features=768, out_features=768, bias=True)
        (w_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (attn): MultiHeadAttention(
        (w_query): Linear(in_features

In [86]:
def assign(left, right):
    """Copy the values of ``right`` into the tensor/parameter ``left`` in place.

    Args:
        left: destination tensor (typically an ``nn.Parameter``); it keeps its
            own dtype and device.
        right: source values — a NumPy array, a tensor, or anything
            ``torch.tensor`` accepts (e.g. nested lists).

    Returns:
        ``left`` itself, now holding the values of ``right``.

    Raises:
        ValueError: if the shapes of ``left`` and ``right`` differ.
    """
    # Convert first so the shape check also works for inputs without ``.shape``.
    # An existing tensor is used as-is (detached): wrapping a tensor in
    # ``torch.tensor`` makes a needless copy and emits a UserWarning.
    if isinstance(right, torch.Tensor):
        source = right.detach()
    else:
        source = torch.tensor(right)
    if left.shape != source.shape:
        raise ValueError(f"Shape mismatch: left = {left.shape}, right = {source.shape}")
    # no_grad: the in-place write to a leaf parameter must not be tracked by autograd.
    with torch.no_grad():
        left.copy_(source)
    return left


In [87]:
import numpy as np

def load_weights_to_gpt(gpt,params):
    """Copy a pretrained GPT-2 parameter dictionary into ``gpt``.

    Args:
        gpt: model exposing ``pos_emb``, ``tok_emb``, ``trf_blocks``,
            ``final_norm`` and ``out_head``.
        params: dictionary with keys ``'wpe'``, ``'wte'``, ``'g'``, ``'b'`` and
            ``'blocks'``; every block entry holds ``attn`` (``c_attn``,
            ``c_proj``), ``mlp`` (``c_fc``, ``c_proj``), ``ln_1`` and ``ln_2``.

    Linear weight matrices are transposed before being copied. ``wte`` is used
    for both the token embedding and the output head.
    """
    # Embedding tables.
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])

    for layer_idx in range(len(params['blocks'])):
        layer_params = params['blocks'][layer_idx]
        block = gpt.trf_blocks[layer_idx]
        attention = block.attn
        feed_forward = block.ff

        # The checkpoint stores query/key/value fused along the last axis;
        # split them into three equal parts.
        fused_qkv = layer_params["attn"]["c_attn"]
        qkv_layers = (attention.w_query, attention.w_key, attention.w_value)

        weight_parts = np.split(fused_qkv["w"], 3, axis=-1)
        for linear, part in zip(qkv_layers, weight_parts):
            linear.weight = assign(linear.weight, part.T)

        bias_parts = np.split(fused_qkv["b"], 3, axis=-1)
        for linear, part in zip(qkv_layers, bias_parts):
            linear.bias = assign(linear.bias, part)

        # Attention output projection and the two feed-forward linears share
        # the same {"w", "b"} layout.
        linear_sources = (
            (attention.out_proj, layer_params["attn"]["c_proj"]),
            (feed_forward.layers[0], layer_params["mlp"]["c_fc"]),
            (feed_forward.layers[2], layer_params["mlp"]["c_proj"]),
        )
        for linear, source in linear_sources:
            linear.weight = assign(linear.weight, source["w"].T)
            linear.bias = assign(linear.bias, source["b"])

        # Layer-norm gain ("g") and bias ("b") map to scale and shift.
        norm_sources = (
            (block.norm1, layer_params["ln_1"]),
            (block.norm2, layer_params["ln_2"]),
        )
        for norm, source in norm_sources:
            norm.scale = assign(norm.scale, source["g"])
            norm.shift = assign(norm.shift, source["b"])

    # Final norm and output head.
    gpt.final_norm.scale = assign(gpt.final_norm.scale, params['g'])
    gpt.final_norm.shift = assign(gpt.final_norm.shift, params['b'])
    gpt.out_head.weight = assign(gpt.out_head.weight, params['wte'])

In [88]:
# Prefer a CUDA GPU when one is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [89]:
# Copy the pretrained GPT-2 weights into the model.
load_weights_to_gpt(myGpt_2, params=params)
# Move the model to the same device the prompt tensor is sent to before generation;
# otherwise the model stays on the CPU while the input may live on CUDA, which fails
# with a device-mismatch error on GPU machines.
myGpt_2 = myGpt_2.to(device)

In [90]:
def text_to_token(text, tokenizer):
    """Encode ``text`` and return the ids as a tensor with a leading batch axis.

    ``tokenizer.encode`` is called with ``'<|endoftext|>'`` allowed as a special
    token; the result has shape (1, number_of_tokens).
    """
    token_ids = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    # Indexing with None prepends the batch dimension.
    return torch.tensor(token_ids)[None, :]
def token_to_text(ids, tokenizer):
    """Decode a tensor of token ids back to text.

    A leading axis of size one (the batch axis) is squeezed away before the ids
    are handed to ``tokenizer.decode`` as a plain Python list.
    """
    id_list = ids.squeeze(0).tolist()
    return tokenizer.decode(id_list)

In [91]:
import tiktoken
# tiktoken's "gpt2" encoding; passed to text_to_token / token_to_text.
tokenizer = tiktoken.get_encoding("gpt2")

In [92]:
def generate(model,idx,max_new_token,context_size,temp=0.0,top_k=None,eos_id= None):
    """Autoregressively extend ``idx`` by up to ``max_new_token`` tokens.

    Args:
        model: callable mapping token ids (batch, seq) to logits (batch, seq, vocab).
        idx: tensor of starting token ids with shape (batch, seq).
        max_new_token: maximum number of tokens to append.
        context_size: only the last ``context_size`` tokens are fed to the model.
        temp: sampling temperature; ``0.0`` (default) selects greedy argmax
            decoding, values above zero sample from the scaled softmax.
        top_k: if given, only the ``top_k`` largest logits per row stay eligible.
        eos_id: if given, generation stops (without appending) as soon as every
            sequence in the batch produces this id.

    Returns:
        ``idx`` with the generated token ids concatenated along the last axis.
    """
    for _ in range(max_new_token):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        # Only the prediction for the last position is needed: (batch, vocab).
        logits = logits[:, -1, :]

        # Top-k filtering: everything below each row's k-th largest logit is
        # pushed to -inf. The threshold keeps a trailing axis of size one so it
        # broadcasts per row, and masked_fill stays on the logits' own device
        # instead of depending on a global ``device``.
        if top_k is not None:
            top_logits, _ = torch.topk(logits, top_k)
            min_val = top_logits[:, -1:]
            logits = logits.masked_fill(logits < min_val, float("-inf"))

        # Temperature scaling + sampling, or greedy decoding when temp == 0.
        if temp > 0.0:
            probas = torch.softmax(logits / temp, dim=-1)
            idx_next = torch.multinomial(probas, num_samples=1)
        else:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)

        # Explicit None check and .all() keep the stop test well-defined for
        # batches larger than one.
        if eos_id is not None and bool((idx_next == eos_id).all()):
            break
        idx = torch.cat((idx, idx_next), dim=-1)

    return idx


In [95]:
# Seed PyTorch's RNG so the sampled continuation is repeatable, then generate up to 50
# new tokens from the prompt (top-50 filtering, temperature 1.5) and decode them to text.
torch.manual_seed(123)
token_ids   = generate(model=myGpt_2, idx=text_to_token("That day was special and a bit lucky for me",tokenizer).to(device),
                       max_new_token=50,context_size=NEW_CONFIG['context_length'],
                       top_k=50,temp=1.5)
print('Output Text: ', token_to_text(token_ids,tokenizer))

Output Text:  That day was special and a bit lucky for me," Icahn told Sporting News from his home in Austin, Texas. "It started off as a little set of circumstances that I couldn't live by, but actually played through it that way. Now I understand it now on how my mom helped
