# Analysis of Phi-3.5-mini model



In [1]:
import transformers
import torch

model_path = "microsoft/Phi-3.5-mini-instruct"

phi_model = transformers.AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_path)

# print(model)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
# Phi Model Wrapper

class PhiModel(torch.nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model
    
    def forward(self, input_ids, attention_mask):
        return self.model(input_ids, attention_mask).logits


In [4]:
# Prepare test inputs
test_inputs = tokenizer("Hello, how are you?", return_tensors="pt")
print(test_inputs)


{'input_ids': tensor([[15043, 29892,   920,   526,   366, 29973]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}


In [7]:
model = PhiModel(phi_model).eval()

output = model(**test_inputs)

In [8]:
output

tensor([[[19.6329, 19.6842, 23.8997,  ..., 23.4851, 23.4887, 23.4872],
         [35.2728, 39.5948, 40.0355,  ..., 34.6642, 34.6659, 34.6639],
         [34.8641, 36.0914, 34.7488,  ..., 31.1677, 31.1690, 31.1691],
         [34.6998, 35.8825, 37.9439,  ..., 30.6538, 30.6549, 30.6538],
         [36.7390, 37.6414, 40.1978,  ..., 31.1590, 31.1605, 31.1616],
         [39.4064, 46.0986, 46.8669,  ..., 35.0389, 35.0401, 35.0382]]],
       grad_fn=<UnsafeViewBackward0>)

In [18]:
# test forward pass
phi_output = phi_model.forward(**test_inputs)

In [20]:
phi_output[0]

tensor([[[19.6329, 19.6842, 23.8997,  ..., 23.4851, 23.4887, 23.4872],
         [35.2728, 39.5948, 40.0355,  ..., 34.6642, 34.6659, 34.6639],
         [34.8641, 36.0914, 34.7488,  ..., 31.1677, 31.1690, 31.1691],
         [34.6998, 35.8825, 37.9439,  ..., 30.6538, 30.6549, 30.6538],
         [36.7390, 37.6414, 40.1978,  ..., 31.1590, 31.1605, 31.1616],
         [39.4064, 46.0986, 46.8669,  ..., 35.0389, 35.0401, 35.0382]]],
       grad_fn=<UnsafeViewBackward0>)

| past_key_values: torch.Size([1, 32, 4, 96])

In [7]:
phi_output[0][:, -1, :]

tensor([[38.6764, 44.1411, 45.2279,  ..., 35.4041, 35.4046, 35.4034]],
       grad_fn=<SliceBackward0>)

In [10]:
input_ids = torch.zeros((1, 2), dtype=torch.int32)
attention_mask = torch.ones((1, 2), dtype=torch.int32)


In [11]:
traced_model = torch.jit.trace(model.eval(), (input_ids, attention_mask))

  if sequence_length != 1:
  if seq_len > self.original_max_position_embeddings:
  ext_factors = torch.tensor(self.short_factor, dtype=torch.float32, device=x.device)
  if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
  if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):


## Convert model to CoreML


In [14]:
import coremltools as ct
import numpy as np

query_length = ct.RangeDim(lower_bound=1, upper_bound=2048, default=1)

inputs = [
    ct.TensorType(name="inputIds", shape=(1, query_length), dtype=np.int32),
    ct.TensorType(name="attentionMask", shape=(1, query_length), dtype=np.int32),
]

outputs = [
    ct.TensorType(name="logits", dtype=np.float16),
]

Torch version 2.4.1+cu121 has not been tested with coremltools. You may run into unexpected errors. Torch 2.3.0 is the most recent version that has been tested.
Failed to load _MLModelProxy: No module named 'coremltools.libcoremlpython'


In [15]:
fp16_mlmodel = ct.convert(
    traced_model.eval(),
    inputs=inputs,
    outputs=outputs,
    source="pytorch",
    minimum_deployment_target=ct.target.iOS18,
    compute_precision=ct.precision.FLOAT32
)

Converting PyTorch Frontend ==> MIL Ops:   0%|          | 0/4529 [00:00<?, ? ops/s]Core ML embedding (gather) layer does not support any inputs besides the weights and indices. Those given will be ignored.
Saving value type of int64 into a builtin type of int32, might lose precision!
Converting PyTorch Frontend ==> MIL Ops:   5%|▍         | 223/4529 [00:00<00:01, 2222.04 ops/s]Saving value type of int64 into a builtin type of int32, might lose precision!
Saving value type of int64 into a builtin type of int32, might lose precision!
Converting PyTorch Frontend ==> MIL Ops:  11%|█         | 506/4529 [00:00<00:01, 2536.87 ops/s]Saving value type of int64 into a builtin type of int32, might lose precision!
Saving value type of int64 into a builtin type of int32, might lose precision!
Converting PyTorch Frontend ==> MIL Ops:  17%|█▋        | 787/4529 [00:00<00:01, 2649.87 ops/s]Saving value type of int64 into a builtin type of int32, might lose precision!
Saving value type of int64 into a b

In [16]:
fp16_mlmodel.save("phi-3.5-mini-instruct-fp32.mlpackage")