In [1]:
import torch
from torch import nn as nn
from transformers import GPTNeoXForCausalLM

# Registry of Pythia checkpoints.
# Maps a short model key to a spec dict with three fields:
#   "llm_family"  - family label (always "pythia" here)
#   "llm_cls"     - class whose `from_pretrained` is used to load the checkpoint
#   "hf_hub_path" - path passed to `from_pretrained`
# Every entry shares the same family and class, so only the (key, hub path)
# pairs are spelled out; insertion order follows the tuple below.
PYTHIA_MODELS = {
    model_key: {
        "llm_family": "pythia",
        "llm_cls": GPTNeoXForCausalLM,
        "hf_hub_path": hub_path,
    }
    for model_key, hub_path in (
        ("pythia-160m", "EleutherAI/pythia-160m-deduped"),
        ("pythia-410m", "EleutherAI/pythia-410m-deduped"),
        ("pythia-1b", "EleutherAI/pythia-1b-deduped"),
        ("pythia-1p4b", "EleutherAI/pythia-1.4b-deduped"),
        ("pythia-2p8b", "EleutherAI/pythia-2.8b-deduped"),
    )
}

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import AutoConfig, AutoModelForCausalLM
import torch

# Report the hidden size (d_model) of every checkpoint in PYTHIA_MODELS.
#
# Only the configuration is fetched for each checkpoint: `hidden_size` is a
# config field, so there is no need to instantiate the full set of weights
# just to read it.
for model_name, model_info in PYTHIA_MODELS.items():
    print(f"Loading model: {model_name}")

    # These two names are deliberately kept as plain loop variables: after the
    # loop finishes they still hold the values of the last entry iterated.
    model_cls = model_info['llm_cls']
    hf_hub_path = model_info['hf_hub_path']

    # Config-only load; no model weights are materialised.
    config = AutoConfig.from_pretrained(hf_hub_path)

    # Retrieve the full rank (d_model) from the model configuration
    d_model = config.hidden_size

    # Print the model name and the d_model value
    print(f"{model_name} has d_model (input/output dimension size): {d_model}")


Loading model: pythia-160m
pythia-160m has d_model (input/output dimension size): 768
Loading model: pythia-410m
pythia-410m has d_model (input/output dimension size): 1024
Loading model: pythia-1b
pythia-1b has d_model (input/output dimension size): 2048
Loading model: pythia-1p4b
pythia-1p4b has d_model (input/output dimension size): 2048
Loading model: pythia-2p8b
pythia-2p8b has d_model (input/output dimension size): 2560


In [3]:
# Load one checkpoint and display its module tree.
# The checkpoint is chosen explicitly by key instead of relying on whatever
# `model_cls` / `hf_hub_path` happened to be left over from an earlier loop;
# "pythia-2p8b" is the last entry of PYTHIA_MODELS, so the same model is shown.
inspect_key = "pythia-2p8b"
model_cls = PYTHIA_MODELS[inspect_key]["llm_cls"]
hf_hub_path = PYTHIA_MODELS[inspect_key]["hf_hub_path"]

model = model_cls.from_pretrained(hf_hub_path)
model

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 2560)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-31): 32 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=2560, out_features=7680, bias=True)
          (dense): Linear(in_features=2560, out_features=2560, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=2560, out_features=10240, bias=True)
          (dense_4h_to_h): Linear(in_features=10240, out_features=2560, bias=True)
