In [1]:
import json
import os

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig


In [2]:
# Hub repository id of the checkpoint explored in this notebook.
model_name = "meta-llama/Meta-Llama-3-8B"
# Credentials must never be hardcoded in a notebook: the Hugging Face access
# token is read from the HF_TOKEN environment variable instead. When the
# variable is unset this is None, in which case `from_pretrained` relies on
# credentials stored locally (e.g. via `huggingface-cli login`).
# SECURITY: the token literal that used to live on this line has to be treated
# as leaked and should be revoked in the Hugging Face account settings.
token_s = os.environ.get("HF_TOKEN")

In [3]:
# Load the causal-LM weights, the tokenizer and the configuration for
# `model_name` from the Hugging Face Hub (or the local cache).
# `token=` is used instead of `use_auth_token=`, which is deprecated in
# transformers; the value passed is the same `token_s` credential.
model = AutoModelForCausalLM.from_pretrained(model_name, token=token_s)
tokenizer = AutoTokenizer.from_pretrained(model_name, token=token_s)
config = AutoConfig.from_pretrained(model_name, token=token_s)



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# Tokenizer sanity check: encode a short string without special tokens,
# decode the ids again, and show both sides of the round trip.
encoded_input = tokenizer.encode("hello world!", add_special_tokens=False)
decoded_output = tokenizer.decode(encoded_input)
print(
    f"Encoded input: {encoded_input}",
    f"Decoded output: {decoded_output}",
    sep="\n",
)

Encoded input: [15339, 1917, 0]
Decoded output: hello world!


In [6]:
# Mapping from tensor name to tensor for everything stored in the model.
state_dict = model.state_dict()

# Show the first 20 entry names as pretty-printed JSON, then the config object.
first_entry_names = list(state_dict)[:20]
print("dict: ", json.dumps(first_entry_names, indent=4))
print("config: ", config)

dict:  [
    "model.embed_tokens.weight",
    "model.layers.0.self_attn.q_proj.weight",
    "model.layers.0.self_attn.k_proj.weight",
    "model.layers.0.self_attn.v_proj.weight",
    "model.layers.0.self_attn.o_proj.weight",
    "model.layers.0.mlp.gate_proj.weight",
    "model.layers.0.mlp.up_proj.weight",
    "model.layers.0.mlp.down_proj.weight",
    "model.layers.0.input_layernorm.weight",
    "model.layers.0.post_attention_layernorm.weight",
    "model.layers.1.self_attn.q_proj.weight",
    "model.layers.1.self_attn.k_proj.weight",
    "model.layers.1.self_attn.v_proj.weight",
    "model.layers.1.self_attn.o_proj.weight",
    "model.layers.1.mlp.gate_proj.weight",
    "model.layers.1.mlp.up_proj.weight",
    "model.layers.1.mlp.down_proj.weight",
    "model.layers.1.input_layernorm.weight",
    "model.layers.1.post_attention_layernorm.weight",
    "model.layers.2.self_attn.q_proj.weight"
]
config:  LlamaConfig {
  "_name_or_path": "meta-llama/Meta-Llama-3-8B",
  "architectures": 

In [45]:
# Show which configuration class was resolved and the number of attention
# heads it declares, one value per line.
print(type(config), config.num_attention_heads, sep="\n")

<class 'transformers.models.llama.configuration_llama.LlamaConfig'>
32


In [8]:
# Prompt whose tokens are pushed through the model step by step below.
prompt = "the answer to the ultimate question of life, the universe, and everything is "
# Encode with the tokenizer's default settings (unlike the round-trip check
# earlier, special tokens are not disabled here) and show the resulting ids.
tokens = tokenizer.encode(text=prompt)
print(tokens)

[128000, 1820, 4320, 311, 279, 17139, 3488, 315, 2324, 11, 279, 15861, 11, 323, 4395, 374, 220]


In [9]:
# Turn the token ids into a tensor so they can index the embedding table.
# torch.as_tensor (rather than torch.tensor) keeps this cell safe to re-run:
# when `tokens` is already a tensor it is returned as-is instead of being
# copy-constructed, which would emit a UserWarning.
tokens = torch.as_tensor(tokens)
# Decode the whole id sequence back to text to verify the round trip.
# NOTE(review): despite its name, `prompt_split_as_tokens` is one decoded
# string, not a list with one entry per token — confirm this is intended.
prompt_split_as_tokens = tokenizer.decode(tokens.tolist())
print(prompt_split_as_tokens)

<|begin_of_text|>the answer to the ultimate question of life, the universe, and everything is 


In [46]:
# Look up the (not yet normalized) input embedding of every prompt token.
# A standalone nn.Embedding sized from the config is filled with the model's
# "model.embed_tokens.weight" tensor and then indexed with the token ids;
# the result is cast to bfloat16.
embedding_layer = torch.nn.Embedding(config.vocab_size, config.hidden_size)
embedding_layer.weight.data.copy_(model.state_dict()["model.embed_tokens.weight"])
token_embeddings_unnormalized = embedding_layer(tokens).to(torch.bfloat16)
# Fixed typo: this line used to print `token_embedding_unnormalized` (missing
# "s"), a name this cell never assigns, so it raised NameError on a fresh
# kernel and only worked with leftover kernel state.
print(token_embeddings_unnormalized.shape)


torch.Size([17, 4096])


In [33]:
# RMSNorm comes from `main`, which is not defined in this notebook (presumably a
# project-local main.py — TODO confirm). If the import fails because the
# `fairscale` package is missing, install it first, e.g. `%pip install fairscale`.
from main import RMSNorm

In [47]:
# Apply RMS normalization to the raw token embeddings; the normalized tensor
# is what gets multiplied with the attention projection weights further down.
# NOTE(review): the norm module is freshly constructed here, so unless RMSNorm
# loads weights on its own, the checkpoint's
# "model.layers.0.input_layernorm.weight" is not applied — confirm intended.
rms_norm = RMSNorm(dim=config.hidden_size)
token_embeddings = rms_norm(token_embeddings_unnormalized)
token_embeddings.shape

torch.Size([17, 4096])

In [48]:
# Collect the shapes of the four layer-0 self-attention projection matrices
# (q, k, v, o) and print them on a single line, separated by spaces.
layer0_attn_shapes = [
    state_dict[weight_name].shape
    for weight_name in (
        "model.layers.0.self_attn.q_proj.weight",
        "model.layers.0.self_attn.k_proj.weight",
        "model.layers.0.self_attn.v_proj.weight",
        "model.layers.0.self_attn.o_proj.weight",
    )
]
print(*layer0_attn_shapes)

torch.Size([4096, 4096]) torch.Size([1024, 4096]) torch.Size([1024, 4096]) torch.Size([4096, 4096])


In [50]:
# Split the layer-0 query projection into one weight slice per attention head.
q_proj_weight = state_dict["model.layers.0.self_attn.q_proj.weight"]
# Rows of the projection are divided evenly among the heads.
head_dim = q_proj_weight.shape[0] // config.num_attention_heads
# Reinterpret (rows, hidden) as (heads, head_dim, hidden) without copying.
q_layer0 = q_proj_weight.view(
    config.num_attention_heads,
    head_dim,
    config.hidden_size,
)
q_layer0.shape


torch.Size([32, 128, 4096])

In [51]:
# Pick the query weights that belong to the first attention head of layer 0.
first_head_index = 0
q_layer0_head0 = q_layer0[first_head_index]
q_layer0_head0.shape


torch.Size([128, 4096])

In [52]:
# Project every normalized token embedding with head 0's query weights:
# (tokens, hidden) @ (hidden, head_dim) gives one query vector per token.
q_per_token = token_embeddings @ q_layer0_head0.T
q_per_token.shape


torch.Size([17, 128])