In [70]:
from dataset import get_dataloader, GPTTokenizer
from model import (GPTModel, inference, token_ids_to_text, text_to_token_ids, generate)
from config import CUSTOM_GPT_CONFIG, GPT2_CONFIG
from torchinfo import summary
from torch.nn import functional as F
from torch import optim
from tqdm import tqdm
from rich import print as pprint
from torch import tensor
import torch

In [71]:
# Sample prompts handed to generate() further down.
prompts = [
    "Merry Christmas!!",
    "Whos is smart",
]

In [72]:
# Display the hyperparameter dict imported from config (used to build the model below).
CUSTOM_GPT_CONFIG

{'vocab_size': 50257,
 'context_length': 1024,
 'emb_dim': 768,
 'n_heads': 12,
 'n_layers': 12,
 'drop_rate': 0.1,
 'qkv_bias': True}

In [73]:
# Seed PyTorch's global RNG first so that whatever GPTModel draws from it while
# being constructed is repeatable after a kernel restart.
torch.manual_seed(123)
model = GPTModel(CUSTOM_GPT_CONFIG)

In [44]:
# Show torchinfo's per-layer parameter-count table for the model.
summary(model)

Layer (type:depth-idx)                   Param #
GPTModel                                 --
├─Embedding: 1-1                         38,597,376
├─Embedding: 1-2                         786,432
├─Dropout: 1-3                           --
├─Sequential: 1-4                        --
│    └─TransformerBlock: 2-1             --
│    │    └─MultiHeadAttention: 3-1      2,362,368
│    │    └─FeedForward: 3-2             4,722,432
│    │    └─LayerNorm: 3-3               1,536
│    │    └─LayerNorm: 3-4               1,536
│    │    └─Dropout: 3-5                 --
│    └─TransformerBlock: 2-2             --
│    │    └─MultiHeadAttention: 3-6      2,362,368
│    │    └─FeedForward: 3-7             4,722,432
│    │    └─LayerNorm: 3-8               1,536
│    │    └─LayerNorm: 3-9               1,536
│    │    └─Dropout: 3-10                --
│    └─TransformerBlock: 2-3             --
│    │    └─MultiHeadAttention: 3-11     2,362,368
│    │    └─FeedForward: 3-12            4,722,432
│   

In [45]:
# Number of scalar entries summed over every tensor yielded by model.parameters().
# A generator expression keeps the loop variable out of the notebook namespace.
total_params = sum(param.numel() for param in model.parameters())
total_params

163037184

In [50]:
# Run generation for `prompts` on the CPU with temperature 0.8.
# NOTE(review): the output recorded for this cell is a RuntimeError (size 50257
# vs 2 at dimension 1). The 2 equals len(prompts), so the failure presumably comes
# from batched handling inside generate() -- confirm in model.py.
res = generate(
    model=model,
    tokenizer=GPTTokenizer,
    max_new_tokens=30,
    temperature=0.8,
    DEVICE='cpu',
    prompts=prompts,
    context_size=CUSTOM_GPT_CONFIG['context_length'],
)

RuntimeError: The size of tensor a (50257) must match the size of tensor b (2) at non-singleton dimension 1

In [48]:
# Print the raw return value of generate() with rich's print.
pprint(res)

In [13]:
# Print each generated response under a 1-based heading, separated by blank lines.
for idx, response in enumerate(res, start=1):
    pprint(
        f"Response {idx} : \n\n {response}",
        end='\n\n\n',
    )

## Loading the pretrained OpenAI GPT-2 weights

In [14]:
from gpt_download import download_and_load_gpt2

# Checkpoint size identifier passed to download_and_load_gpt2 in the next cell.
model_size = "124M"

  from pandas.core import (


In [15]:
# Download the checkpoint files into `model_dir` and load them; the call returns
# a (settings, params) pair.
model_dir = "downloaded_weights"
settings, params = download_and_load_gpt2(
    model_size=model_size,
    models_dir=model_dir,
)



File already exists and is up-to-date: downloaded_weights/124M/checkpoint




File already exists and is up-to-date: downloaded_weights/124M/encoder.json




File already exists and is up-to-date: downloaded_weights/124M/hparams.json




File already exists and is up-to-date: downloaded_weights/124M/model.ckpt.data-00000-of-00001




File already exists and is up-to-date: downloaded_weights/124M/model.ckpt.index




File already exists and is up-to-date: downloaded_weights/124M/model.ckpt.meta




File already exists and is up-to-date: downloaded_weights/124M/vocab.bpe


In [16]:
def assign(left, right):
    """Build a fresh ``torch.nn.Parameter`` holding ``right``, shaped like ``left``.

    The new parameter is created on ``left``'s device with ``left``'s dtype, so
    a float64 NumPy array or a CPU array loaded into a GPU model does not leave
    the model with mismatched weights.

    Args:
        left: The tensor/Parameter being replaced; supplies the expected shape,
            dtype and device.
        right: The new values, as a NumPy array (or anything ``torch.tensor``
            accepts) or a ``torch.Tensor``.

    Returns:
        A new ``torch.nn.Parameter`` containing a copy of ``right``.

    Raises:
        ValueError: If ``left.shape`` and ``right.shape`` differ.
    """
    if left.shape != right.shape:
        raise ValueError(f"Shape mismatch. Left: {left.shape}, Right: {right.shape}")

    if isinstance(right, torch.Tensor):
        # torch.tensor(<tensor>) emits a copy-construct UserWarning; detach+clone
        # is the recommended way to copy an existing tensor.
        values = right.detach().clone()
    else:
        values = torch.tensor(right)

    return torch.nn.Parameter(values.to(device=left.device, dtype=left.dtype))

In [19]:
import numpy as np

def load_weights_into_gpt(gpt, params):
    """Copy the arrays held in ``params`` onto the matching parameters of ``gpt``.

    Each target is replaced through ``assign``, which raises ``ValueError`` when
    the source and target shapes differ.

    Args:
        gpt: Model exposing ``pos_emb``, ``tok_emb``, ``trf_blocks``,
            ``final_norm`` and ``out_head``.
        params: Mapping with keys ``'wpe'``, ``'wte'``, ``'g'``, ``'b'`` and
            ``'blocks'``; each entry of ``'blocks'`` holds ``'attn'``,
            ``'mlp'``, ``'ln_1'`` and ``'ln_2'`` sub-mappings.

    Returns:
        None. ``gpt`` is modified in place.
    """
    gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])
    gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])

    for b in range(len(params["blocks"])):
        layer_src = params["blocks"][b]
        attn_src = layer_src["attn"]

        # The combined c_attn weight is cut into three equal parts along its
        # last axis, in query / key / value order; each part is transposed
        # before being assigned.
        qkv_weights = np.split(attn_src["c_attn"]["w"], 3, axis=-1)
        block = gpt.trf_blocks[b]
        att = block.att
        qkv_layers = (att.W_query, att.W_key, att.W_value)
        for layer, w in zip(qkv_layers, qkv_weights):
            layer.weight = assign(layer.weight, w.T)

        # The combined bias is split the same way but assigned untransposed.
        qkv_biases = np.split(attn_src["c_attn"]["b"], 3, axis=-1)
        for layer, bias in zip(qkv_layers, qkv_biases):
            layer.bias = assign(layer.bias, bias)

        # Attention output projection.
        out_proj = att.out_proj
        out_proj.weight = assign(out_proj.weight, attn_src["c_proj"]["w"].T)
        out_proj.bias = assign(out_proj.bias, attn_src["c_proj"]["b"])

        # Feed-forward layers live at indices 0 and 2 of ff.layers.
        mlp_src = layer_src["mlp"]
        ff_layers = block.ff.layers
        for position, key in ((0, "c_fc"), (2, "c_proj")):
            linear = ff_layers[position]
            linear.weight = assign(linear.weight, mlp_src[key]["w"].T)
            linear.bias = assign(linear.bias, mlp_src[key]["b"])

        # Layer norms: 'g' feeds scale, 'b' feeds shift.
        for norm, ln_key in ((block.norm1, "ln_1"), (block.norm2, "ln_2")):
            ln_src = layer_src[ln_key]
            norm.scale = assign(norm.scale, ln_src["g"])
            norm.shift = assign(norm.shift, ln_src["b"])

    final_norm = gpt.final_norm
    final_norm.scale = assign(final_norm.scale, params["g"])
    final_norm.shift = assign(final_norm.shift, params["b"])
    # The output head is filled from the same 'wte' array as the token embedding.
    gpt.out_head.weight = assign(gpt.out_head.weight, params["wte"])



In [20]:
# Replace the model's parameters with the downloaded weights (modifies `model` in place).
# NOTE(review): this must run after the cell that builds `model`; if `model` is
# re-created later, re-run this cell or generation will use the fresh weights.
load_weights_into_gpt(model, params)

In [40]:
# Use the same prompt twice so a single call yields two responses.
prompts = ["Marry had a little lamb"] * 2

# Put the module in evaluation mode before generating.
model.eval()

# NOTE(review): the output recorded for this cell is a RuntimeError (size 50257
# vs 2 at dimension 1). The 2 equals len(prompts), so the failure presumably comes
# from batched handling inside generate() -- confirm in model.py.
# NOTE(review): eos_id is given as the token string, not an integer id -- confirm
# that generate() expects a string here.
res = generate(
    model=model,
    tokenizer=GPTTokenizer,
    max_new_tokens=100,
    temperature=0.9,
    DEVICE='cpu',
    prompts=prompts,
    context_size=CUSTOM_GPT_CONFIG['context_length'],
    top_K=50,
    eos_id="<|endoftext|>",
)

# Print each response under a 1-based heading.
for idx, response in enumerate(res, start=1):
    pprint(
        f"Response {idx} : \n\n {response}",
        end='\n\n\n',
    )

RuntimeError: The size of tensor a (50257) must match the size of tensor b (2) at non-singleton dimension 1