## 1. Download GPT2

In [1]:
import os
import urllib.request

from tqdm import tqdm

In [2]:
# Settings shared by the download cells below
model_size = "124M"  # model variant; used as a path segment in both the URL and the local folder
models_dir = "intermediates/gpt2"  # local root folder under which each model size gets its own sub-folder
sample_file = "hparams.json"  # file used for the single-file sample download

### 1.1. Prep Source

In [None]:
# Remote location of the GPT-2 model files
base_url = "https://openaipublic.blob.core.windows.net/gpt-2/models"
# URLs always use "/" as separator; os.path.join would insert "\" on Windows
sample_url = "/".join([base_url, model_size, sample_file])
sample_url

### 1.2. Prep Destination

In [3]:
# Local folder for the selected model size; created if it does not exist yet
model_dir = os.path.join(models_dir, model_size)
os.makedirs(model_dir, exist_ok=True)
# Local path the sample file is written to
sample_destination = os.path.join(model_dir, sample_file)
# Only the last expression of a cell is displayed, so the preview goes at the end
model_dir

### 1.3. Sample Download

In [None]:
# Download the sample file, unless a local copy of the same size is already present
with urllib.request.urlopen(sample_url) as response:
    # Size announced by the server; 0 when the Content-Length header is missing
    file_size = int(response.headers.get("Content-Length", 0))
    print(file_size)

    # Treat the local copy as current only when the server reported a real size and
    # it matches; an unknown size (0) must not match an empty or partial local file
    is_up_to_date = (
        file_size > 0
        and os.path.exists(sample_destination)
        and os.path.getsize(sample_destination) == file_size
    )

    if is_up_to_date:
        print(f"File already exists and is up-to-date: {sample_destination}")
    else:
        # Define the block size for reading the file
        block_size = 1024  # 1 Kilobyte

        progress_bar_description = os.path.basename(sample_url)  # Extract filename from URL
        with tqdm(total=file_size, unit="iB", unit_scale=True, desc=progress_bar_description) as progress_bar:
            # Open the destination file in binary write mode
            with open(sample_destination, "wb") as file:
                # Read the file in chunks and write to destination
                while True:
                    chunk = response.read(block_size)
                    if not chunk:
                        break
                    file.write(chunk)
                    progress_bar.update(len(chunk))  # Update progress bar

### 1.4. Complete Download

In [7]:
# Update sys path for imports to work
# Make the project root importable so the `M3_weightloading` package can be found
import sys

project_root = "../../../LLMFromScratch"
# Guard the append so re-running this cell does not stack duplicate entries
if project_root not in sys.path:
    sys.path.append(project_root)

In [8]:
from M3_weightloading.gpt_download import download_file

In [9]:
# Files to fetch for the selected model size
filenames = [
    "checkpoint",
    "encoder.json",
    "hparams.json",
    "model.ckpt.data-00000-of-00001",
    "model.ckpt.index",
    "model.ckpt.meta",
    "vocab.bpe",
]

In [None]:
# Fetch every listed file into the local model folder
for filename in filenames:
    # Build the URL with "/" explicitly: os.path.join would emit "\" on Windows
    source_file_url = "/".join([base_url, model_size, filename])
    # The local path does use the platform separator
    destination_file_path = os.path.join(model_dir, filename)
    download_file(source_file_url, destination_file_path)

## 2. Load from TensorFlow checkpoint
Since OpenAI used TensorFlow, we have to install TensorFlow and use it to load the weights.

In [11]:
import json
import tensorflow as tf

In [None]:
print(model_dir)
# latest_checkpoint returns None when no checkpoint is found in the folder
tf_ckpt_path = tf.train.latest_checkpoint(model_dir)
if tf_ckpt_path is None:
    # Fail here with a clear message instead of passing None to the cells below
    raise FileNotFoundError(f"No TensorFlow checkpoint found in {model_dir!r}; run the download step first")
print(tf_ckpt_path)

### 2.1. Load Settings

In [None]:
# Read the hyperparameters; the context manager closes the file handle right after parsing
with open(os.path.join(model_dir, "hparams.json"), encoding="utf-8") as settings_file:
    settings = json.load(settings_file)
settings

### 2.2 Load Params

In [14]:
import numpy as np

In [None]:
# Skeleton of the parameter dict: one empty dict per layer, settings["n_layer"] in total
params = {"blocks": [{} for _ in range(settings["n_layer"])]}
params

In [None]:
# Show the variables stored in the checkpoint
tf.train.list_variables(tf_ckpt_path)

#### 2.2.1 Load Sample Layer params

In [17]:
# Sample per-layer variable used to walk through the loading steps by hand
name = "model/h0/attn/c_attn/w"

In [None]:
# Load the variable from the checkpoint as an array and drop singleton dimensions
variable_value = np.squeeze(tf.train.load_variable(tf_ckpt_path, name))  # np.squeeze removes every dimension of length 1
variable_value.shape

In [None]:
# Split the variable path into its components
variable_name_parts = name.split("/")[1:]  # Skip the 'model/' prefix
variable_name_parts

In [20]:
# The first component is "h<N>"; strip the leading "h" to get the layer index
layer_number = int(variable_name_parts[0][1:])  # "0" of "h0"
# Reference (not a copy) to that layer's dict, so writes below land inside `params`
target_dict = params["blocks"][layer_number]  # Pointer to element in the param dict

In [21]:
# Walk the components between the layer id and the final key.
# setdefault creates the nested dict when the key is missing and returns it,
# so `target_dict` moves one level deeper on each iteration.
for key in variable_name_parts[1:-1]:
    target_dict = target_dict.setdefault(key, {})

In [22]:
# The final component is the key under which the array is stored
last_key = variable_name_parts[-1]  # Get last key name and assign value
target_dict[last_key] = variable_value

In [None]:
# Inspect `params` after inserting the sample layer variable
params

#### 2.2.2 Load Sample Non-Layer params

In [None]:
# Gather every variable name in the checkpoint, then keep those outside the per-layer "model/h..." prefix
checkpoint_variable_names = [var_name for var_name, _ in tf.train.list_variables(tf_ckpt_path)]
non_layer_variables = [
    var_name for var_name in checkpoint_variable_names
    if not var_name.startswith("model/h")
]
print(non_layer_variables)

In [25]:
# Sample variable that does not belong to a layer (no "h<N>" component)
name = "model/ln_f/b"

In [None]:
# Load the variable from the checkpoint as an array and drop singleton dimensions
variable_value = np.squeeze(tf.train.load_variable(tf_ckpt_path, name))  # np.squeeze removes every dimension of length 1
variable_value.shape

In [None]:
# Split the variable path into its components
variable_name_parts = name.split("/")[1:]  # Skip the 'model/' prefix
variable_name_parts

In [None]:
# Non-layer variables are written at the top level, so point directly at `params` (same object, not a copy)
target_dict = params
target_dict

In [None]:
# Components between the first and the last one; empty for "ln_f/b", so no nested dict is created
variable_name_parts[1:-1]

In [None]:
# The final component ("b" for "model/ln_f/b") is the key the array is stored under
destination_key = variable_name_parts[-1]
destination_key

In [31]:
# Store the array at the top level of `params` under the final key only
target_dict[destination_key] = variable_value  # Note: "ln_f" in the variable name is dropped

In [None]:
# Inspect the dict (same object as `params`) after inserting the non-layer variable
target_dict

#### 2.2.3 Load all params using utility

In [33]:
# Update sys path for imports to work
# Make the project root importable so the `M3_weightloading` package can be found
import sys

project_root = "../../../LLMFromScratch"
# Guard the append so re-running this cell does not stack duplicate entries
if project_root not in sys.path:
    sys.path.append(project_root)

In [34]:
from M3_weightloading.gpt_download import load_gpt2_params_from_tf_ckpt

In [35]:
# Load all parameters with the project utility, given the checkpoint path and the layer count
# NOTE(review): presumably applies the manual steps above to every variable; confirm in gpt_download
complete_params = load_gpt2_params_from_tf_ckpt(tf_ckpt_path, settings["n_layer"])

In [None]:
# Top-level keys of the loaded parameter dict
complete_params.keys()

In [None]:
# Shapes of the arrays stored under "wpe" and "wte"
complete_params["wpe"].shape, complete_params["wte"].shape

## 3. Test E2E Utility Function to download and load GPT2 weights

In [1]:
# Update sys path for imports to work
# Make the project root importable so the `M3_weightloading` package can be found
import sys

project_root = "../../../LLMFromScratch"
# Guard the append so re-running this cell does not stack duplicate entries
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
from M3_weightloading.gpt_download import download_and_load_gpt2_params

In [3]:
# Inputs for the end-to-end utility call
model_size = "124M"  # model variant to fetch
destination_dir = "intermediates/gpt2"  # local root folder passed to the utility

In [5]:
# Single call that returns a (settings, params) pair for the given model size and destination folder
test_settings, test_params = download_and_load_gpt2_params(model_size, destination_dir)

File already exists and is up-to-date: intermediates/gpt2/124M/checkpoint
File already exists and is up-to-date: intermediates/gpt2/124M/encoder.json
File already exists and is up-to-date: intermediates/gpt2/124M/hparams.json
File already exists and is up-to-date: intermediates/gpt2/124M/model.ckpt.data-00000-of-00001
File already exists and is up-to-date: intermediates/gpt2/124M/model.ckpt.index
File already exists and is up-to-date: intermediates/gpt2/124M/model.ckpt.meta
File already exists and is up-to-date: intermediates/gpt2/124M/vocab.bpe


In [6]:
# Inspect the settings returned by the utility
test_settings

{'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 768, 'n_head': 12, 'n_layer': 12}

In [7]:
# Top-level keys of the parameter dict returned by the utility
test_params.keys()

dict_keys(['blocks', 'b', 'g', 'wpe', 'wte'])