In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("microsoft/phi-1_5", torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5")

for key in model.state_dict().keys():
    print(key)

    print()



model.embed_tokens.weight

model.layers.0.self_attn.q_proj.weight

model.layers.0.self_attn.q_proj.bias

model.layers.0.self_attn.k_proj.weight

model.layers.0.self_attn.k_proj.bias

model.layers.0.self_attn.v_proj.weight

model.layers.0.self_attn.v_proj.bias

model.layers.0.self_attn.dense.weight

model.layers.0.self_attn.dense.bias

model.layers.0.mlp.fc1.weight

model.layers.0.mlp.fc1.bias

model.layers.0.mlp.fc2.weight

model.layers.0.mlp.fc2.bias

model.layers.0.input_layernorm.weight

model.layers.0.input_layernorm.bias

model.layers.1.self_attn.q_proj.weight

model.layers.1.self_attn.q_proj.bias

model.layers.1.self_attn.k_proj.weight

model.layers.1.self_attn.k_proj.bias

model.layers.1.self_attn.v_proj.weight

model.layers.1.self_attn.v_proj.bias

model.layers.1.self_attn.dense.weight

model.layers.1.self_attn.dense.bias

model.layers.1.mlp.fc1.weight

model.layers.1.mlp.fc1.bias

model.layers.1.mlp.fc2.weight

model.layers.1.mlp.fc2.bias

model.layers.1.input_layernorm.weight

# Loading the Config for the model

In [2]:
from transformers import PretrainedConfig

config = PretrainedConfig.from_pretrained("microsoft/phi-1_5")
config

You are using a model of type phi to instantiate a model of type . This is not supported for all configurations of models and can yield errors.


PretrainedConfig {
  "architectures": [
    "PhiForCausalLM"
  ],
  "attention_dropout": 0.0,
  "embd_pdrop": 0.0,
  "hidden_act": "gelu_new",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 8192,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 2048,
  "num_attention_heads": 32,
  "num_hidden_layers": 24,
  "num_key_value_heads": null,
  "partial_rotary_factor": 0.5,
  "qk_layernorm": false,
  "resid_pdrop": 0.0,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.53.1",
  "use_cache": true,
  "vocab_size": 51200
}

# Input & Tokenization

In [3]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5")
text = "Hello, do you know about reimann zeta function?"

# Tokenize the text
tokenized_text = tokenizer(text, return_tensors="pt")
tokens, attention_mask = tokenized_text["input_ids"], tokenized_text["attention_mask"]

print("Tokens:", tokens)
print("Attention mask:", attention_mask)

Tokens: tensor([[15496,    11,   466,   345,   760,   546, 21123,  1236,  1976, 17167,
          2163,    30]])
Attention mask: tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


# Embedding layer

In [5]:
import torch
import torch.nn as nn

class Embedding(nn.Module):
    """Token embedding with dropout."""

    def __init__(self, config: PretrainedConfig) -> None:
        super().__init__()

        self.wte = nn.Embedding(config.vocab_size, config.hidden_size) # Token embedding
        # Loading the token embedding weights from the model    
        self.wte.weight.data = model.model.embed_tokens.weight.data.clone()
        self.drop = nn.Dropout(config.embd_pdrop)

    def forward(self, input_ids: torch.LongTensor) -> torch.FloatTensor:
        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])

        hidden_states = self.wte(input_ids)
        hidden_states = self.drop(hidden_states)

        return hidden_states

In [6]:
model.model.embed_tokens.weight.data

tensor([[ 9.7046e-03, -1.5488e-02,  6.0272e-02,  ...,  9.5520e-03,
         -5.4169e-02, -5.8174e-03],
        [ 2.4323e-02,  5.4321e-02,  1.7776e-02,  ...,  2.5421e-02,
         -4.3854e-02,  3.9612e-02],
        [-4.1565e-02,  3.6987e-02, -1.5976e-02,  ...,  4.7394e-02,
         -1.6113e-02,  4.3716e-03],
        ...,
        [-1.5259e-05,  3.0160e-05, -1.6034e-05,  ..., -1.9729e-05,
         -1.3590e-05,  9.2745e-05],
        [-8.0466e-06, -2.6107e-05, -5.1260e-05,  ...,  4.0054e-05,
          4.9233e-05, -1.6689e-05],
        [ 3.2783e-06, -1.7822e-05,  2.4676e-05,  ..., -3.4511e-05,
         -2.0921e-05,  1.6928e-05]], dtype=torch.float16)

In [7]:
embedding = Embedding(config)
token_embed = embedding(tokens)
print(token_embed.shape)

torch.Size([1, 12, 2048])


# Scaled Dot product attention

In [None]:
def scaled_dot_product_attention(q, k, v, mask=None):
    

torch.Size([1, 12, 2048])