In [1]:
import torch
from transformers import AutoModelForCausalLM, GPT2Tokenizer
# transformers is the Hugging Face model library
# AutoModelForCausalLM loads the model with its language-modeling head
# (the unembedding matrix, `lm_head`) attached at the end
from torchinfo import summary

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load a pretrained GPT-2 model and its tokenizer.
# The checkpoint name lives in one place so the model and tokenizer are always
# loaded from the same checkpoint when you switch sizes.
# 'gpt2' is the small model; larger options: 'gpt2-medium', 'gpt2-large', 'gpt2-xl'.
MODEL_NAME = 'gpt2'

gpt2 = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)


In [3]:
# see what's in the model: a bare `gpt2` as the last expression displays
# the nested module tree (embeddings, the stack of blocks, final norm, lm_head)
gpt2

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [4]:
# check out one of the transformer blocks (index 3 of the 12 stored in `h`);
# note this is a whole GPT2Block (layer norms + attention + MLP), not a single attention head
gpt2.transformer.h[3]

GPT2Block(
  (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (attn): GPT2Attention(
    (c_attn): Conv1D(nf=2304, nx=768)
    (c_proj): Conv1D(nf=768, nx=768)
    (attn_dropout): Dropout(p=0.1, inplace=False)
    (resid_dropout): Dropout(p=0.1, inplace=False)
  )
  (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (mlp): GPT2MLP(
    (c_fc): Conv1D(nf=3072, nx=768)
    (c_proj): Conv1D(nf=768, nx=3072)
    (act): NewGELUActivation()
    (dropout): Dropout(p=0.1, inplace=False)
  )
)

In [5]:
# List each named parameter next to the shape of its tensor.
# Names are right-aligned in a 40-character column so the shapes line up.
for name, mat in gpt2.named_parameters():
    line = f'{name:>40} is of size: {mat.shape}'
    print(line)

                  transformer.wte.weight is of size: torch.Size([50257, 768])
                  transformer.wpe.weight is of size: torch.Size([1024, 768])
             transformer.h.0.ln_1.weight is of size: torch.Size([768])
               transformer.h.0.ln_1.bias is of size: torch.Size([768])
      transformer.h.0.attn.c_attn.weight is of size: torch.Size([768, 2304])
        transformer.h.0.attn.c_attn.bias is of size: torch.Size([2304])
      transformer.h.0.attn.c_proj.weight is of size: torch.Size([768, 768])
        transformer.h.0.attn.c_proj.bias is of size: torch.Size([768])
             transformer.h.0.ln_2.weight is of size: torch.Size([768])
               transformer.h.0.ln_2.bias is of size: torch.Size([768])
         transformer.h.0.mlp.c_fc.weight is of size: torch.Size([768, 3072])
           transformer.h.0.mlp.c_fc.bias is of size: torch.Size([3072])
       transformer.h.0.mlp.c_proj.weight is of size: torch.Size([3072, 768])
         transformer.h.0.mlp.c_proj.bia

In [6]:
# some additional useful info:
# the config object lists the model's hyperparameters (meta-parameters)
gpt2.config
# n_ctx is the context length, i.e. the sequence length (seq_len) the model works with

GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "dtype": "float32",
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.57.1",
  "use_cache": true,
  "vocab_size": 50257
}

Summary of the model and its parameters

In [7]:
# torchinfo traces a real forward pass, so it needs an example batch of token ids.
prompt = 'Hello, how are you today?'
token_ids = tokenizer.encode(prompt)
x = torch.tensor([token_ids])  # nested list adds the leading batch dimension: [1, seq_len]

# Report per-layer input shape, output shape, and parameter count.
summary(
    gpt2,
    input_data=x,
    col_names=['input_size', 'output_size', 'num_params'],
)

Layer (type:depth-idx)                             Input Shape               Output Shape              Param #
GPT2LMHeadModel                                    [1, 7]                    --                        --
├─GPT2Model: 1-1                                   [1, 7]                    --                        --
│    └─Embedding: 2-1                              [1, 7]                    [1, 7, 768]               38,597,376
│    └─Embedding: 2-2                              [1, 7]                    [1, 7, 768]               786,432
│    └─Dropout: 2-3                                [1, 7, 768]               [1, 7, 768]               --
│    └─ModuleList: 2-4                             --                        --                        --
│    │    └─GPT2Block: 3-1                         [1, 7, 768]               [1, 7, 768]               7,087,872
│    │    └─GPT2Block: 3-2                         [1, 7, 768]               [1, 7, 768]               7,087,872
│    │    └─GP