In [None]:
from transformers import GPT2Config, GPT2LMHeadModel

# Build a GPT-2 "small"-sized configuration, instantiate a randomly
# initialised language-model head on top of it, and report its parameter count.
config = GPT2Config(
    vocab_size=50257,        # Size of the token vocabulary
    n_positions=1024,        # Maximum sequence length (number of position embeddings)
    # NOTE(review): `n_ctx` is kept for compatibility with older transformers
    # releases; newer releases appear to rely on `n_positions` only — confirm
    # against the installed version before removing.
    n_ctx=1024,
    n_embd=768,              # Dimensionality of the embeddings / hidden states
    n_layer=12,              # Number of transformer blocks
    n_head=12,               # Number of attention heads per block
    # GPT2Config names the feed-forward width `n_inner`; the previously used
    # `intermediate_size` is not a GPT-2 config field, so it did not control
    # the MLP size. 3072 equals the 4 * n_embd default, so the model built
    # here is unchanged.
    n_inner=3072,
    activation_function='gelu_new',  # Activation used in the feed-forward layers
    initializer_range=0.02,  # Std-dev used for weight initialisation
)

# Construct a GPT-2 model (with LM head) from the configuration.
# No pretrained weights are loaded here; the model is built from `config` only.
model = GPT2LMHeadModel(config)

# Sum the element counts of every tensor yielded by model.parameters().
total_params = sum(p.numel() for p in model.parameters())

print(f"Total parameters: {total_params}")
