<a href="https://colab.research.google.com/github/sheneman/finetune/blob/main/tune/freeze.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# freeze.py
#
# An example of Parameter Efficient Fine Tuning
#
# Load GPT-2 from HuggingFace
# ----------------------------
#   1. Print the layers of the model
#   2. Freeze the first 5/6 of the transformer blocks (and the token embedding) prior to fine-tuning
#   3. Print the layers of the model (again, now with frozen transformer blocks)
#
# Luke Sheneman
# Research Computing and Data Services (RCDS)
# Institute for Interdisciplinary Data Sciences (IIDS)
#
# sheneman@uidaho.edu
# 2024
#

In [2]:
import torch
from transformers import GPT2LMHeadModel


In [3]:
# Load the pre-trained GPT-2 model (the "gpt2" checkpoint) through the
# GPT2LMHeadModel class. The cell output shows config.json, model.safetensors
# and generation_config.json being fetched when this runs.
model = GPT2LMHeadModel.from_pretrained("gpt2")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [4]:
# Iterator over every parameter tensor of the model; a later cell walks it
# to count the trainable parameters.
params = model.parameters()

# Direct child modules of the model, in the order the model yields them
model_layers = [*model.children()]

# The first child is the GPT2Model backbone (see the printed output); its
# repr gives a condensed view of the architecture.
first_layer = model_layers[0]

# A single print call, newline-separated, emits exactly the same stream as
# printing each piece on its own.
print(
    "\n",
    "Condensed representation of GPT-2 Architecture:",
    "\n\n",
    first_layer,
    "\n\n",
    sep="\n",
)



Condensed representation of GPT-2 Architecture:



GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)





In [5]:
# Report the size of the untouched model and the frozen status of every
# named parameter tensor.
#
# The trainable-parameter total is computed from a fresh model.parameters()
# iterator created right here. Such an iterator can only be walked once, so
# relying on one built in an earlier cell makes this cell report 0 as soon as
# it is run a second time.
trainable_param_count = sum(
    p.numel() for p in model.parameters() if p.requires_grad
)

print("\n\n*******************************")
print("ORIGINAL MODEL:")
print("NUMBER OF TRANSFORMER BLOCKS: ", len(model.transformer.h))
print("NUMBER OF TRAINABLE PARAMETERS: ", trainable_param_count)
print("*******************************\n")

# One line per named parameter; "Frozen" is simply the inverse of requires_grad
for idx, (name, param) in enumerate(model.named_parameters()):
    print(f"Layer {idx}: {name}, Frozen: {not param.requires_grad}")




*******************************
ORIGINAL MODEL:
NUMBER OF TRANSFORMER BLOCKS:  12
NUMBER OF TRAINABLE PARAMETERS:  124439808
*******************************

Layer 0: transformer.wte.weight, Frozen: False
Layer 1: transformer.wpe.weight, Frozen: False
Layer 2: transformer.h.0.ln_1.weight, Frozen: False
Layer 3: transformer.h.0.ln_1.bias, Frozen: False
Layer 4: transformer.h.0.attn.c_attn.weight, Frozen: False
Layer 5: transformer.h.0.attn.c_attn.bias, Frozen: False
Layer 6: transformer.h.0.attn.c_proj.weight, Frozen: False
Layer 7: transformer.h.0.attn.c_proj.bias, Frozen: False
Layer 8: transformer.h.0.ln_2.weight, Frozen: False
Layer 9: transformer.h.0.ln_2.bias, Frozen: False
Layer 10: transformer.h.0.mlp.c_fc.weight, Frozen: False
Layer 11: transformer.h.0.mlp.c_fc.bias, Frozen: False
Layer 12: transformer.h.0.mlp.c_proj.weight, Frozen: False
Layer 13: transformer.h.0.mlp.c_proj.bias, Frozen: False
Layer 14: transformer.h.1.ln_1.weight, Frozen: False
Layer 15: transformer.h.1.ln_

In [6]:
# Banner announcing the freezing step (same three lines, one print call)
print(
    "\n\n*******************************",
    "FREEZING MODEL FOR FINE-TUNING",
    "*******************************\n",
    sep="\n",
)


# How many leading transformer blocks make up 5/6 of the stack
# (integer division, so the result is rounded down).
num_layers = len(model.transformer.h)
num_layers_to_freeze = (num_layers * 5) // 6

# Turn off gradient tracking for every parameter inside those leading blocks
blocks_to_freeze = model.transformer.h[:num_layers_to_freeze]
for weight in (p for block in blocks_to_freeze for p in block.parameters()):
    weight.requires_grad = False



*******************************
FREEZING MODEL FOR FINE-TUNING
*******************************



In [7]:
#
# Here, let's explicitly freeze the token embedding layer
#
# Find the "wte" module by the LAST component of its dotted module name.
# An exact match is used instead of a substring test so that a module whose
# name merely contains the letters "wte" can never be picked up by mistake.
wte_layer = next(
    (
        module
        for module_name, module in model.named_modules()
        if module_name.rpartition(".")[2] == "wte"
    ),
    None,
)

if wte_layer is None:
    # Report the miss rather than silently leaving the embedding trainable
    print('WARNING: no "wte" module found; the token embedding was NOT frozen')
else:
    # Freeze the "wte" layer by setting requires_grad to False for its parameters
    # NOTE(review): GPT-2 normally ties lm_head.weight to this embedding, so the
    # output projection is presumably frozen along with it -- confirm.
    for param in wte_layer.parameters():
        param.requires_grad = False


In [8]:
# Fresh parameter iterator for this cell; the count below consumes it
params = model.parameters()
trainable_total = sum(p.numel() for p in params if p.requires_grad)
block_total = len(model.transformer.h)

# Summary banner for the model after freezing
print("\n\n*******************************")
print("FROZEN MODEL:")
print("NUMBER OF TRANSFORMER BLOCKS: ", block_total)
print("NUMBER OF TRAINABLE PARAMETERS: ", trainable_total)
print("*******************************\n")

# Per-parameter listing: "Frozen" is True wherever requires_grad is off
for idx, (name, param) in enumerate(model.named_parameters()):
    print(f"Layer {idx}: {name}, Frozen: {not param.requires_grad}")



*******************************
FROZEN MODEL:
NUMBER OF TRANSFORMER BLOCKS:  12
NUMBER OF TRAINABLE PARAMETERS:  14963712
*******************************

Layer 0: transformer.wte.weight, Frozen: True
Layer 1: transformer.wpe.weight, Frozen: False
Layer 2: transformer.h.0.ln_1.weight, Frozen: True
Layer 3: transformer.h.0.ln_1.bias, Frozen: True
Layer 4: transformer.h.0.attn.c_attn.weight, Frozen: True
Layer 5: transformer.h.0.attn.c_attn.bias, Frozen: True
Layer 6: transformer.h.0.attn.c_proj.weight, Frozen: True
Layer 7: transformer.h.0.attn.c_proj.bias, Frozen: True
Layer 8: transformer.h.0.ln_2.weight, Frozen: True
Layer 9: transformer.h.0.ln_2.bias, Frozen: True
Layer 10: transformer.h.0.mlp.c_fc.weight, Frozen: True
Layer 11: transformer.h.0.mlp.c_fc.bias, Frozen: True
Layer 12: transformer.h.0.mlp.c_proj.weight, Frozen: True
Layer 13: transformer.h.0.mlp.c_proj.bias, Frozen: True
Layer 14: transformer.h.1.ln_1.weight, Frozen: True
Layer 15: transformer.h.1.ln_1.bias, Frozen: T