In [1]:
# Extend the module search path so the project packages imported in the cells
# below can be resolved.
import sys

# print("Before:", "\n".join(sys.path))  ## Optional print to check
# Only append when the entry is missing, so re-running this cell does not keep
# adding duplicate entries to sys.path.
if "../../../LLMFromScratch" not in sys.path:
    sys.path.append("../../../LLMFromScratch")
# print("After:", "\n".join(sys.path))  ## Optional print to check

## 1. Recreating GPT Download from previous notebook - 0

In [2]:
from M3_weightloading.gpt_download import download_and_load_gpt2_params

In [3]:
# Checkpoint variant to fetch and the local folder its files are stored under.
model_size, destination_dir = "124M", "intermediates/gpt2"

In [4]:
# Fetch the GPT-2 checkpoint files into `destination_dir` and load both the
# model settings and the parameter dictionary. In the recorded run all files
# were already present and reported as up-to-date.
gpt2_settings, gpt2_params = download_and_load_gpt2_params(
    model_size,
    destination_dir,
)

File already exists and is up-to-date: intermediates/gpt2/124M/checkpoint
File already exists and is up-to-date: intermediates/gpt2/124M/encoder.json
File already exists and is up-to-date: intermediates/gpt2/124M/hparams.json
File already exists and is up-to-date: intermediates/gpt2/124M/model.ckpt.data-00000-of-00001
File already exists and is up-to-date: intermediates/gpt2/124M/model.ckpt.index
File already exists and is up-to-date: intermediates/gpt2/124M/model.ckpt.meta
File already exists and is up-to-date: intermediates/gpt2/124M/vocab.bpe


In [5]:
# Inspect the loaded hyperparameters and the top-level parameter groups.
(gpt2_settings, gpt2_params.keys())

({'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 768, 'n_head': 12, 'n_layer': 12},
 dict_keys(['blocks', 'b', 'g', 'wpe', 'wte']))

## 2. Load our GPTModel and config

### 2.1 Load and update config to GPT2-124M using gpt2_settings

In [6]:
# Display the small trial configuration to see which keys our model config
# uses; the GPT-2 124M config built in the next cell provides the same keys.
from M1_simple_gpt_model.config import TRIAL_CONFIG_PARAMS
TRIAL_CONFIG_PARAMS

{'context_length': 9,
 'drop_rate': 0.0,
 'emb_dim': 10,
 'n_heads': 2,
 'n_layers': 3,
 'qvbias': False,
 'vocab_size': 50257}

In [7]:
# Configuration for our model matching GPT-2 124M. Every size-related value is
# taken from the settings loaded with the original GPT-2 checkpoint.
#
# `qvbias` is enabled because the original GPT-2 initialized the query, key and
# value linear layers of multi-head attention with bias vectors. Those biases
# are not required or recommended in general, but the pretrained weights can
# only be loaded correctly if our layers have them as well.
GPT2_124M_PARAMS = dict(
    context_length=gpt2_settings["n_ctx"],
    drop_rate=0.1,
    emb_dim=gpt2_settings["n_embd"],
    n_heads=gpt2_settings["n_head"],
    n_layers=gpt2_settings["n_layer"],
    qvbias=True,
    vocab_size=gpt2_settings["n_vocab"],
)

### 2.2 Load Model

In [8]:
from M1_simple_gpt_model.trial_gpt_model import TrialGPTModel

In [9]:
# Build our model with the GPT-2 124M configuration and put it straight into
# evaluation mode (eval() hands back the model itself), then display its layout.
gpt2_124m_model = TrialGPTModel(GPT2_124M_PARAMS).eval()
gpt2_124m_model

TrialGPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (norm_1): LayerNorm()
      (multi_head_attention): MultiHeadAttention(
        (linear_query): Linear(in_features=768, out_features=768, bias=True)
        (linear_key): Linear(in_features=768, out_features=768, bias=True)
        (linear_value): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (out_proj_linear): Linear(in_features=768, out_features=768, bias=True)
      )
      (drop_out): Dropout(p=0.1, inplace=False)
      (norm_2): LayerNorm()
      (feed_forward): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
    )
    (1): TransformerBlock(
      (norm_1

In [10]:
# Inspect the model's state dict: how many entries it holds and what the first
# ones are called. In the recorded output the first 19 names are the two
# embedding tables followed by every entry of transformer block 0.
parameters_dict = gpt2_124m_model.state_dict()
print(len(parameters_dict))
print(list(parameters_dict)[:19])

210
['tok_emb.weight', 'pos_emb.weight', 'transformer_blocks.0.norm_1.scale', 'transformer_blocks.0.norm_1.shift', 'transformer_blocks.0.multi_head_attention.mask', 'transformer_blocks.0.multi_head_attention.linear_query.weight', 'transformer_blocks.0.multi_head_attention.linear_query.bias', 'transformer_blocks.0.multi_head_attention.linear_key.weight', 'transformer_blocks.0.multi_head_attention.linear_key.bias', 'transformer_blocks.0.multi_head_attention.linear_value.weight', 'transformer_blocks.0.multi_head_attention.linear_value.bias', 'transformer_blocks.0.multi_head_attention.out_proj_linear.weight', 'transformer_blocks.0.multi_head_attention.out_proj_linear.bias', 'transformer_blocks.0.norm_2.scale', 'transformer_blocks.0.norm_2.shift', 'transformer_blocks.0.feed_forward.layers.0.weight', 'transformer_blocks.0.feed_forward.layers.0.bias', 'transformer_blocks.0.feed_forward.layers.2.weight', 'transformer_blocks.0.feed_forward.layers.2.bias']


## 3. Assign the OpenAI weights from gpt2_params to the corresponding weight tensors in our GPTModel instance

In [11]:
from M3_weightloading.manual_load import ManualWeightLoading

In [12]:
# Create the weight-loading helper for our GPT-2 124M model instance.
# NOTE(review): presumably the loader keeps a reference to the model and
# updates it in place when assign() is called — confirm in
# M3_weightloading.manual_load.
manual_weight_loader = ManualWeightLoading(gpt2_124m_model)

In [13]:
# Assign the downloaded OpenAI parameters to the model. The recorded output
# logs an update for the token/position embeddings and, for each transformer
# block, its two layer norms, its attention module and its feed-forward module.
manual_weight_loader.assign(gpt2_params)

Updated Token and Position Embedding weights with incoming parameters.
Updated Layer Norm layer 1 of Transformer Block 0 with scale and shift from incoming params.
Updated MHA of Transformer Block 0 with weights and biases from incoming params.
Updated Layer Norm layer 2 of Transformer Block 0 with scale and shift from incoming params.
Updated FF of Transformer Block 0 with weights and biases from incoming params.
Updated Layer Norm layer 1 of Transformer Block 1 with scale and shift from incoming params.
Updated MHA of Transformer Block 1 with weights and biases from incoming params.
Updated Layer Norm layer 2 of Transformer Block 1 with scale and shift from incoming params.
Updated FF of Transformer Block 1 with weights and biases from incoming params.
Updated Layer Norm layer 1 of Transformer Block 2 with scale and shift from incoming params.
Updated MHA of Transformer Block 2 with weights and biases from incoming params.
Updated Layer Norm layer 2 of Transformer Block 2 with scale 

## 4. Test the GPTModel instance with the assigned weights

In [14]:
import torch

from M0_data.helpers import GPT2Tokenizer
from M1_simple_gpt_model.generate import generate_text

In [38]:
# Prompts used for a quick sanity check of the loaded model.
test_strings = [
    "Every effort moves you towards your goal",
    "correct the spelling of a the word: whatver",
]

# Tokenizer that converts the prompts to token IDs and back.
tokenizer = GPT2Tokenizer()

# Prefer the GPU when one is available, otherwise stay on the CPU, and move
# the model to the chosen device.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
gpt2_124m_model.to(device)

TrialGPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (norm_1): LayerNorm()
      (multi_head_attention): MultiHeadAttention(
        (linear_query): Linear(in_features=768, out_features=768, bias=True)
        (linear_key): Linear(in_features=768, out_features=768, bias=True)
        (linear_value): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (out_proj_linear): Linear(in_features=768, out_features=768, bias=True)
      )
      (drop_out): Dropout(p=0.1, inplace=False)
      (norm_2): LayerNorm()
      (feed_forward): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
    )
    (1): TransformerBlock(
      (norm_1

In [39]:
# Step 1: encode the prompts as token IDs.
tokenized_in_strings = tokenizer.tokenize_batch(test_strings, max_in_seq_len=15)

# Step 2: let the model extend every prompt by a few tokens.
output_tokens = generate_text(
    gpt2_124m_model,
    tokenized_in_strings,
    max_tokens_to_generate=5,
    context_size=GPT2_124M_PARAMS["context_length"],
    print_interims=False,
)

# Step 3: decode the token IDs back into text and show the result.
# NOTE(review): in the recorded output each prompt is followed only by a run
# of "!" characters. Presumably tokenize_batch pads the prompts up to
# max_in_seq_len and generation then continues from the padding — confirm
# against GPT2Tokenizer and generate_text before trusting this check.
output = tokenizer.detokenize_batch(output_tokens)
print(output)

['Every effort moves you towards your goal!!!!!!!!!!!!!', 'correct the spelling of a the word: whatver!!!!!!!!!!']


## Footnote
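We can tell whether the model weights were loaded correctly by checking that the model generates coherent text; if we made even a small mistake in the weight assignment, the model would not be able to do that. Note that in the run recorded above the text appended to each prompt consists only of `!` characters, so this particular output does not yet demonstrate coherent generation; the padding introduced by `max_in_seq_len=15` is the likely cause and should be checked before drawing that conclusion.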