## 0. Imports

In [1]:
# ====================================================
# Update sys path for imports to work
import sys
# print(f"Before: {"\n".join(sys.path)}")  ## Optional print to check
sys.path.append("../../../LLMFromScratch")
# print(f"After: {"\n".join(sys.path)}")  ## Optional print to check
# ====================================================

In [2]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

from M0_data.helpers import GPT2Tokenizer as CUSTOM_GPT2Tokenizer
from M1_simple_gpt_model.generate import generate_text
from M1_simple_gpt_model.trial_gpt_model import TrialGPTModel
from M3_weightloading.gpt_download import download_and_load_gpt2_params
from M3_weightloading.manual_load import ManualWeightLoading

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
seed_value = 1000   # For Consistency

## 1. Test String

In [4]:
test_strings = ["India is also called as "]

## 2. Get Reference output from HuggingFace GPT2 Model

In [5]:
# Tokenize
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', skip_special_tokens=True, padding_side='left')
tokenizer.pad_token = tokenizer.eos_token  # Set the padding token
encoded_input = tokenizer.batch_encode_plus(test_strings, return_tensors='pt', truncation=True, padding=True)
print(encoded_input)

# Model
reference_model = GPT2LMHeadModel.from_pretrained('gpt2', pad_token_id=tokenizer.eos_token_id)
torch.manual_seed(seed_value)
output = reference_model.generate(**encoded_input)

# De-Tokenize
tokenizer.batch_decode(output, skip_special_tokens=True)

{'input_ids': tensor([[21569,   318,   635,  1444,   355,   220]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}


['India is also called as \xa0the "India of the future" by the World Bank.\nThe Indian government has been trying']

### 2.0 Original GPT2 Model Architechture

In [6]:
reference_model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

## 4. Our Model

### 4.1. Load and update config to GPT2-124M using gpt2_settings

In [7]:
model_size = "124M"
destination_dir = "intermediates/gpt2"

In [8]:
gpt2_settings, gpt2_params = download_and_load_gpt2_params(model_size, destination_dir)

File already exists and is up-to-date: intermediates/gpt2/124M/checkpoint
File already exists and is up-to-date: intermediates/gpt2/124M/encoder.json
File already exists and is up-to-date: intermediates/gpt2/124M/hparams.json
File already exists and is up-to-date: intermediates/gpt2/124M/model.ckpt.data-00000-of-00001
File already exists and is up-to-date: intermediates/gpt2/124M/model.ckpt.index
File already exists and is up-to-date: intermediates/gpt2/124M/model.ckpt.meta
File already exists and is up-to-date: intermediates/gpt2/124M/vocab.bpe


In [9]:
GPT2_124M_PARAMS = {
    "context_length": gpt2_settings["n_ctx"],  # Using from Original GPT2 Model Settings
    "drop_rate": 0.1,
    "emb_dim": gpt2_settings["n_embd"],  # Using from Original GPT2 Model Settings
    "n_heads": gpt2_settings["n_head"],  # Using from Original GPT2 Model Settings
    "n_layers": gpt2_settings["n_layer"],  # Using from Original GPT2 Model Settings
    "qvbias": True,  # Note that the original GPT model initialized the linear layers for the query, key, and value matrices in the multi-head attention module with bias vectors, which is not required or recommended; however, to be able to load the weights correctly, we have to enable these too by setting qkv_bias to True in our implementation, too
    "vocab_size": gpt2_settings["n_vocab"]  # Using from Original GPT2 Model Settings
    }

### 4.2. Load Model

In [10]:
custom_gpt2_model = TrialGPTModel(GPT2_124M_PARAMS)

### 4.3. Assign OpenAI weights to our GPTModel instance

In [11]:
manual_weight_loader = ManualWeightLoading(custom_gpt2_model)
manual_weight_loader.assign(gpt2_params)

Updated Token and Position Embedding weights with incoming parameters.
Updated Layer Norm layer 1 of Transformer Block 0 with scale and shift from incoming params.
Updated MHA of Transformer Block 0 with weights and biases from incoming params.
Updated Layer Norm layer 2 of Transformer Block 0 with scale and shift from incoming params.
Updated FF of Transformer Block 0 with weights and biases from incoming params.
Updated Layer Norm layer 1 of Transformer Block 1 with scale and shift from incoming params.
Updated MHA of Transformer Block 1 with weights and biases from incoming params.
Updated Layer Norm layer 2 of Transformer Block 1 with scale and shift from incoming params.
Updated FF of Transformer Block 1 with weights and biases from incoming params.
Updated Layer Norm layer 1 of Transformer Block 2 with scale and shift from incoming params.
Updated MHA of Transformer Block 2 with weights and biases from incoming params.
Updated Layer Norm layer 2 of Transformer Block 2 with scale 

### 4.4 Our Model Architechture

In [12]:
custom_gpt2_model

TrialGPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (norm_1): LayerNorm()
      (multi_head_attention): MultiHeadAttention(
        (linear_query): Linear(in_features=768, out_features=768, bias=True)
        (linear_key): Linear(in_features=768, out_features=768, bias=True)
        (linear_value): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (out_proj_linear): Linear(in_features=768, out_features=768, bias=True)
      )
      (norm_2): LayerNorm()
      (feed_forward): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
          (3): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (1): TransformerBlock(
      (norm_1): 

### 4.5. Test output from assigned GPTModel instance

In [13]:
# Tokenize
custom_gpt2_tokenizer = CUSTOM_GPT2Tokenizer()
tokenized_in_strings = custom_gpt2_tokenizer.tokenize_batch(test_strings)
print(tokenized_in_strings)

# Model
torch.manual_seed(seed_value)
output_tokens = generate_text(custom_gpt2_model, tokenized_in_strings, max_tokens_to_generate=10, context_size=GPT2_124M_PARAMS["context_length"], print_interims=False)

# De-Tokenize
custom_gpt2_tokenizer.detokenize_batch(output_tokens)

tensor([[21569,   318,   635,  1444,   355,   220]])


['India is also called as \xa0"the most important country in the world"']

## Footnote
We know that we loaded the model weights correctly because the model can generate coherent text; if we made even a small mistake, the mode would not be able to do that