In [4]:
import torch
from torch import nn as nn
from transformers import GPTNeoXForCausalLM

# Registry of the Pythia (deduped) checkpoints used in this notebook.
# Each short name maps to:
#   "llm_family"  - family tag shared by every entry ("pythia")
#   "llm_cls"     - the transformers class used to load the checkpoint
#   "hf_hub_path" - the Hugging Face Hub repository id
# Only the short name and the hub path differ between entries, so the
# per-model dictionaries are generated from (name, hub path) pairs.
PYTHIA_MODELS = {
    short_name: {
        "llm_family": "pythia",
        "llm_cls": GPTNeoXForCausalLM,
        "hf_hub_path": hub_path,
    }
    for short_name, hub_path in (
        ("pythia-160m", "EleutherAI/pythia-160m-deduped"),
        ("pythia-410m", "EleutherAI/pythia-410m-deduped"),
        ("pythia-1b", "EleutherAI/pythia-1b-deduped"),
        ("pythia-1p4b", "EleutherAI/pythia-1.4b-deduped"),
        ("pythia-2p8b", "EleutherAI/pythia-2.8b-deduped"),
    )
}

In [5]:
from transformers import AutoModelForCausalLM
import torch

# Report the hidden size (d_model) of every checkpoint in PYTHIA_MODELS.
#
# Only the configuration is fetched for each entry. Instantiating the full
# model just to read one config field would download and allocate all of its
# weights (the largest entry listed is the 2.8b checkpoint) for no benefit.
for model_name, model_info in PYTHIA_MODELS.items():
    print(f"Loading model: {model_name}")

    # Model class and Hugging Face hub path registered for this entry.
    # These names stay bound to the last entry once the loop finishes.
    model_cls = model_info['llm_cls']
    hf_hub_path = model_info['hf_hub_path']

    # `config_class` is the configuration type paired with the model class;
    # loading it reads the checkpoint's config without touching the weights.
    model_config = model_cls.config_class.from_pretrained(hf_hub_path)

    # Retrieve the full rank (d_model) from the model configuration
    d_model = model_config.hidden_size

    # Print the model name and the d_model value
    print(f"{model_name} has d_model (input/output dimension size): {d_model}")


Loading model: pythia-160m
pythia-160m has d_model (input/output dimension size): 768
Loading model: pythia-410m
pythia-410m has d_model (input/output dimension size): 1024
Loading model: pythia-1b
pythia-1b has d_model (input/output dimension size): 2048
Loading model: pythia-1p4b
pythia-1p4b has d_model (input/output dimension size): 2048
Loading model: pythia-2p8b
pythia-2p8b has d_model (input/output dimension size): 2560


In [6]:
# Load one checkpoint and display its full configuration.
# The checkpoint is selected explicitly from PYTHIA_MODELS instead of relying
# on whatever values an earlier loop happened to leave in `model_cls` and
# `hf_hub_path`, so this cell gives the same result regardless of which cells
# ran before it (as long as PYTHIA_MODELS is defined).
inspected_model_name = "pythia-2p8b"
inspected_model_info = PYTHIA_MODELS[inspected_model_name]
model_cls = inspected_model_info["llm_cls"]
hf_hub_path = inspected_model_info["hf_hub_path"]

model = model_cls.from_pretrained(hf_hub_path)
model.config

GPTNeoXConfig {
  "_name_or_path": "EleutherAI/pythia-2.8b-deduped",
  "architectures": [
    "GPTNeoXForCausalLM"
  ],
  "attention_bias": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.1,
  "eos_token_id": 0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_size": 2560,
  "initializer_range": 0.02,
  "intermediate_size": 10240,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "gpt_neox",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "rope_scaling": null,
  "rotary_emb_base": 10000,
  "rotary_pct": 0.25,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.40.2",
  "use_cache": true,
  "use_parallel_residual": true,
  "vocab_size": 50304
}