In [82]:
import torch.nn as nn
import torch
!pip install tiktoken



In [83]:
class LayerNorm(nn.Module):
  def __init__(self,emb_dim):
    super().__init__()
    self.eps=1e-5
    self.scale= nn.Parameter(torch.ones(emb_dim))
    self.shift= nn.Parameter(torch.zeros(emb_dim))

  def forward(self,x):
    mean= x.mean(dim=-1,keepdim=True)
    var= x.var(dim=-1,keepdim=True,unbiased=False)
    norm_x= (x-mean)/torch.sqrt(var+self.eps)
    return self.scale * norm_x + self.shift

In [84]:
class MultiHeadAttention(nn.Module):
  def __init__(self,d_in,d_out,context_length,dropout,num_heads,qkv_bias=False):
    super().__init__()
    assert(d_out%num_heads==0),\
    "d_out must be divisible by num_heads"
    self.d_out= d_out
    self.num_heads= num_heads
    self.head_dim= d_out//num_heads
    self.W_query= nn.Linear(d_in,d_out,bias=qkv_bias)
    self.W_key= nn.Linear(d_in,d_out,bias=qkv_bias)
    self.W_value= nn.Linear(d_in,d_out,bias=qkv_bias)
    self.out_proj=nn.Linear(d_out,d_out) #linear layer to combine head outputs
    self.dropout= nn.Dropout(dropout)
    self.register_buffer("mask",torch.triu(torch.ones(context_length,context_length),diagonal=1))

  def forward(self,x):
    b,num_tokens,d_in= x.shape
    keys= self.W_key(x) # (b,num_tokens,d_out)
    queries= self.W_query(x)
    values= self.W_value(x)
    # we implicitly split matrix by adding num_heads dim
    keys= keys.view(b,num_tokens,self.num_heads,self.head_dim)
    queries= queries.view(b,num_tokens,self.num_heads,self.head_dim)
    values= values.view(b,num_tokens,self.num_heads,self.head_dim)
    #transpose (b,num_tokens,num_heads,head_dim)->(b,num_heads,num_tokens,head_dim)
    #This dimension helps in parallel computation
    keys= keys.transpose(1,2)
    queries=queries.transpose(1,2)
    values= values.transpose(1,2)
    attn_scores= queries @ keys.transpose(2,3) #(b,num_heads,num_tokens,num_tokens)
    mask_bool= self.mask.bool()[:num_tokens,:num_tokens]
    attn_scores.masked_fill_(mask_bool,-torch.inf)
    attn_weights= torch.softmax(attn_scores/keys.shape[-1]**0.5,dim=-1)
    attn_weights= self.dropout(attn_weights)
    #context vec before tanspose (b,num_heads,num_token,head_dim)
    #after transpose (b,num_tokens,num_heads,head_dim)
    #merging the heads back
    context_vec= (attn_weights @ values).transpose(1,2)
    context_vec= context_vec.contiguous().view(b,num_tokens,self.d_out)
    context_vec= self.out_proj(context_vec)
    return context_vec

In [85]:
class GELU(nn.Module):
  def __init__(self):
    super().__init__()

  def forward(self,x):
    return 0.5*x*(1+torch.tanh(torch.sqrt(torch.tensor(2.0/torch.pi)))*(x+0.044715*torch.pow(x,3)))


In [86]:
class FeedForward(nn.Module):
  def __init__(self,cfg):
    super().__init__()
    self.layers= nn.Sequential(
        nn.Linear(cfg["emb_dim"],4*cfg["emb_dim"]),
        GELU(),
        nn.Linear(4*cfg["emb_dim"],cfg["emb_dim"])
    )

  def forward(self,x):
    return self.layers(x)

In [87]:
class TransformerBlock(nn.Module):
  def __init__(self,cfg):
    super().__init__()
    self.att= MultiHeadAttention(
        d_in= cfg["emb_dim"],
        d_out= cfg["emb_dim"],
        context_length= cfg["context_length"],
        num_heads= cfg["n_heads"],
        dropout= cfg["drop_rate"],
        qkv_bias= cfg["qkv_bias"]
    )
    self.ff= FeedForward(cfg)
    self.norm1= LayerNorm(cfg["emb_dim"])
    self.norm2= LayerNorm(cfg["emb_dim"])
    self.drop_shortcut= nn.Dropout(cfg["drop_rate"])

  def forward(self,x):
    shortcut=x
    x= self.norm1(x)
    x= self.att(x)
    x= self.drop_shortcut(x)
    x=shortcut+x

    shortcut=x
    x= self.norm2(x)
    x= self.ff(x)
    x= self.drop_shortcut(x)
    x=x+shortcut

    return x


In [88]:
class GPTModel(nn.Module):
    def __init__(self, cfg):
        super().__init__()
        self.tok_emb = nn.Embedding(cfg["vocab_size"], cfg["emb_dim"])
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])

        self.trf_blocks = nn.Sequential(
            *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])

        self.final_norm = LayerNorm(cfg["emb_dim"])
        self.out_head = nn.Linear(
            cfg["emb_dim"], cfg["vocab_size"], bias=False
        )

    def forward(self, in_idx):
        batch_size, seq_len = in_idx.shape
        tok_embeds = self.tok_emb(in_idx)
        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))
        x = tok_embeds + pos_embeds  # Shape [batch_size, num_tokens, emb_size]
        x = self.drop_emb(x)
        x = self.trf_blocks(x)
        x = self.final_norm(x)
        logits = self.out_head(x)
        return logits

In [89]:
GPT_CONFIG_124M = {
    "vocab_size": 50257,   # Vocabulary size
    "context_length": 256, # Shortened context length (orig: 1024)
    "emb_dim": 768,        # Embedding dimension
    "n_heads": 12,         # Number of attention heads
    "n_layers": 12,        # Number of layers
    "drop_rate": 0.1,      # Dropout rate
    "qkv_bias": False      # Query-key-value bias
}
model = GPTModel(GPT_CONFIG_124M)

In [90]:
import os
import requests
import json
import numpy as np
import tensorflow as tf
from tqdm import tqdm
import torch

In [91]:

import os
import requests
import json
import numpy as np
import tensorflow as tf
from tqdm import tqdm

def download_and_load_gpt2(model_size, models_dir):
    allowed_sizes = ("124M", "355M", "774M", "1558M")
    if model_size not in allowed_sizes:
        raise ValueError(f"Model size not in {allowed_sizes}")
    model_dir = os.path.join(models_dir, model_size)
    base_url = "https://openaipublic.blob.core.windows.net/gpt-2/models"
    filenames = [
        "checkpoint", "encoder.json", "hparams.json",
        "model.ckpt.data-00000-of-00001", "model.ckpt.index",
        "model.ckpt.meta", "vocab.bpe"
    ]
    os.makedirs(model_dir, exist_ok=True)
    for filename in filenames:
        file_url = os.path.join(base_url, model_size, filename)
        file_path = os.path.join(model_dir, filename)
        download_file(file_url, file_path)
    # Load settings and params
    tf_ckpt_path = tf.train.latest_checkpoint(model_dir)
    settings = json.load(open(os.path.join(model_dir, "hparams.json")))
    params = load_gpt2_params_from_tf_ckpt(tf_ckpt_path, settings)

    return settings, params


def download_file(url,destination):
  try:
    response= requests.get(url,stream=True,verify=False)
    file_size=int(response.headers.get("content-length",0))
    if os.path.exists(destination):
      file_size_local= os.path.getsize(destination)
      if file_size==file_size_local:
        print(f"File already exists and is upto date:",{destination})
        return
    block_size=1024
    progress_bar_description= url.split("/")[-1]
    with tqdm(total=file_size,unit="iB",unit_scale=True,desc=progress_bar_description) as progress_bar:
      with open(destination,"wb") as file:
          for chunk in response.iter_content(block_size):
            progress_bar.update(len(chunk))
            file.write(chunk)
  except requests.exceptions.RequestException as e:
    print(f"Error downloading the file:{e}")
    print(f"Please check the url:{url}")


def load_gpt2_params_from_tf_ckpt(ckpt_path,settings):
  params={"blocks":[{} for _ in range(settings["n_layer"])]}

  for name, _ in tf.train.list_variables(ckpt_path):
    variable_array= np.squeeze(tf.train.load_variable(ckpt_path,name))
    variable_name_parts= name.split("/")[1:]
    target_dict=params
    if variable_name_parts[0].startswith("h"):
      layer_number= int(variable_name_parts[0][1:])
      target_dict= params["blocks"][layer_number]
    for key in variable_name_parts[1:-1]:
      target_dict= target_dict.setdefault(key,{})
    last_key= variable_name_parts[-1]
    target_dict[last_key]= variable_array
  return params


In [92]:
torch.save(model.state_dict(),"model.pth")
model.load_state_dict(torch.load("model.pth"))

<All keys matched successfully>

In [93]:
optimizer= torch.optim.AdamW(model.parameters(),lr=0.0004,weight_decay=0.1)
torch.save({
    "model_state_dict":model.state_dict(),
    "optimizer_state_dict":optimizer.state_dict(),
},"model_optimizer.pth")

In [94]:
checkpoint= torch.load("model_optimizer.pth")
model= GPTModel(GPT_CONFIG_124M)
model.load_state_dict(checkpoint["model_state_dict"])
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=0.1)
optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
model.train()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features

In [95]:
# load pretrained wt from openai
!pip install tensorflow>=2.15.0 tqdm>=4.66

In [96]:
settings,params= download_and_load_gpt2(model_size="124M",models_dir="gpt2")



File already exists and is upto date: {'gpt2/124M/checkpoint'}
File already exists and is upto date: {'gpt2/124M/encoder.json'}




File already exists and is upto date: {'gpt2/124M/hparams.json'}
File already exists and is upto date: {'gpt2/124M/model.ckpt.data-00000-of-00001'}




File already exists and is upto date: {'gpt2/124M/model.ckpt.index'}
File already exists and is upto date: {'gpt2/124M/model.ckpt.meta'}




File already exists and is upto date: {'gpt2/124M/vocab.bpe'}


In [97]:
print("Settings:",settings)
print("params:",params.keys())

Settings: {'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 768, 'n_head': 12, 'n_layer': 12}
params: dict_keys(['blocks', 'b', 'g', 'wpe', 'wte'])


In [98]:
print("word token embedding:",params["wte"])
print(params["wte"].shape)

word token embedding: [[-0.11010301 -0.03926672  0.03310751 ... -0.1363697   0.01506208
   0.04531523]
 [ 0.04034033 -0.04861503  0.04624869 ...  0.08605453  0.00253983
   0.04318958]
 [-0.12746179  0.04793796  0.18410145 ...  0.08991534 -0.12972379
  -0.08785918]
 ...
 [-0.04453601 -0.05483596  0.01225674 ...  0.10435229  0.09783269
  -0.06952604]
 [ 0.1860082   0.01665728  0.04611587 ... -0.09625227  0.07847701
  -0.02245961]
 [ 0.05135201 -0.02768905  0.0499369  ...  0.00704835  0.15519823
   0.12067825]]
(50257, 768)


In [99]:
print(GPT_CONFIG_124M)

{'vocab_size': 50257, 'context_length': 256, 'emb_dim': 768, 'n_heads': 12, 'n_layers': 12, 'drop_rate': 0.1, 'qkv_bias': False}


In [100]:
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}
model_name="gpt2-small (124M)"
NEW_CONFIG= GPT_CONFIG_124M.copy()
NEW_CONFIG.update(model_configs[model_name])

In [101]:
NEW_CONFIG.update({"context_length": 1024, "qkv_bias": True})
gpt = GPTModel(NEW_CONFIG)

In [102]:
def assign(left,right):
  if left.shape!=right.shape:
    raise ValueError(f"Shape mismatch. Left:{left.shape},Right:{right.shape}")
  return torch.nn.Parameter(torch.tensor(right))

In [103]:
import numpy as np

def load_weights_into_gpt(gpt, params):
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])

    for b in range(len(params["blocks"])):
        q_w, k_w, v_w = np.split(
            (params["blocks"][b]["attn"]["c_attn"])["w"], 3, axis=-1)
        gpt.trf_blocks[b].att.W_query.weight = assign(
            gpt.trf_blocks[b].att.W_query.weight, q_w.T)
        gpt.trf_blocks[b].att.W_key.weight = assign(
            gpt.trf_blocks[b].att.W_key.weight, k_w.T)
        gpt.trf_blocks[b].att.W_value.weight = assign(
            gpt.trf_blocks[b].att.W_value.weight, v_w.T)

        q_b, k_b, v_b = np.split(
            (params["blocks"][b]["attn"]["c_attn"])["b"], 3, axis=-1)
        gpt.trf_blocks[b].att.W_query.bias = assign(
            gpt.trf_blocks[b].att.W_query.bias, q_b)
        gpt.trf_blocks[b].att.W_key.bias = assign(
            gpt.trf_blocks[b].att.W_key.bias, k_b)
        gpt.trf_blocks[b].att.W_value.bias = assign(
            gpt.trf_blocks[b].att.W_value.bias, v_b)

        gpt.trf_blocks[b].att.out_proj.weight = assign(
            gpt.trf_blocks[b].att.out_proj.weight,
            params["blocks"][b]["attn"]["c_proj"]["w"].T)
        gpt.trf_blocks[b].att.out_proj.bias = assign(
            gpt.trf_blocks[b].att.out_proj.bias,
            params["blocks"][b]["attn"]["c_proj"]["b"])

        gpt.trf_blocks[b].ff.layers[0].weight = assign(
            gpt.trf_blocks[b].ff.layers[0].weight,
            params["blocks"][b]["mlp"]["c_fc"]["w"].T)
        gpt.trf_blocks[b].ff.layers[0].bias = assign(
            gpt.trf_blocks[b].ff.layers[0].bias,
            params["blocks"][b]["mlp"]["c_fc"]["b"])
        gpt.trf_blocks[b].ff.layers[2].weight = assign(
            gpt.trf_blocks[b].ff.layers[2].weight,
            params["blocks"][b]["mlp"]["c_proj"]["w"].T)
        gpt.trf_blocks[b].ff.layers[2].bias = assign(
            gpt.trf_blocks[b].ff.layers[2].bias,
            params["blocks"][b]["mlp"]["c_proj"]["b"])

        gpt.trf_blocks[b].norm1.scale = assign(
            gpt.trf_blocks[b].norm1.scale,
            params["blocks"][b]["ln_1"]["g"])
        gpt.trf_blocks[b].norm1.shift = assign(
            gpt.trf_blocks[b].norm1.shift,
            params["blocks"][b]["ln_1"]["b"])
        gpt.trf_blocks[b].norm2.scale = assign(
            gpt.trf_blocks[b].norm2.scale,
            params["blocks"][b]["ln_2"]["g"])
        gpt.trf_blocks[b].norm2.shift = assign(
            gpt.trf_blocks[b].norm2.shift,
            params["blocks"][b]["ln_2"]["b"])

    gpt.final_norm.scale = assign(gpt.final_norm.scale, params["g"])
    gpt.final_norm.shift = assign(gpt.final_norm.shift, params["b"])
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])



In [104]:
load_weights_into_gpt(gpt, params)


In [105]:
def generate(model, idx, max_new_tokens, context_size, temperature=0.0, top_k=None, eos_id=None):

    # For-loop is the same as before: Get logits, and only focus on last time step
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -context_size:]
        with torch.no_grad():
            logits = model(idx_cond)
        logits = logits[:, -1, :]

        # New: Filter logits with top_k sampling
        if top_k is not None:
            # Keep only top_k values
            top_logits, _ = torch.topk(logits, top_k)
            min_val = top_logits[:, -1]
            logits = torch.where(logits < min_val, torch.tensor(float("-inf")).to(logits.device), logits)

        # New: Apply temperature scaling
        if temperature > 0.0:
            logits = logits / temperature

            # Apply softmax to get probabilities
            probs = torch.softmax(logits, dim=-1)  # (batch_size, context_len)

            # Sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (batch_size, 1)

        # Otherwise same as before: get idx of the vocab entry with the highest logits value
        else:
            idx_next = torch.argmax(logits, dim=-1, keepdim=True)  # (batch_size, 1)

        if idx_next == eos_id:  # Stop generating early if end-of-sequence token is encountered and eos_id is specified
            break

        # Same as before: append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1)  # (batch_size, num_tokens+1)

    return idx

In [106]:
!pip install tiktoken



In [107]:

import tiktoken
def text_to_token_ids(text,tokenizer):
  encoded= tokenizer.encode(text,allowed_special={'<|endoftext|>'})
  encoded_tensor= torch.tensor(encoded).unsqueeze(0) #Adds a dimension of size 1 at the specified position.
  return encoded_tensor

def token_ids_to_text(token_ids,tokenizer):
  flat= token_ids.squeeze(0)
  return tokenizer.decode(flat.tolist())

In [108]:
tokenizer= tiktoken.get_encoding("gpt2")

In [112]:
token_ids = generate(
    model=gpt,
    idx=text_to_token_ids("Every effort moves you", tokenizer),
    max_new_tokens=25,
    context_size=NEW_CONFIG["context_length"],
    top_k=20,
    temperature=1.2
)

print("Output text:\n", token_ids_to_text(token_ids, tokenizer))

Output text:
 Every effort moves you. in and,. "- on the.. to the in
 with the ',. the the a on

