In [1]:
from transformers import AutoTokenizer, OPTForCausalLM
import coremltools as ct
import numpy as np
import torch

# Device used for the sample input and for the model before tracing.
device = torch.device("cpu")

Torch version 2.5.0.dev20240809 has not been tested with coremltools. You may run into unexpected errors. Torch 2.4.0 is the most recent version that has been tested.


## Download and save the tokenizer

In [2]:
# Hugging Face Hub id of the checkpoint; reused for the cache paths below.
model_name = "facebook/galactica-1.3b"
# Download the tokenizer. `return_tensors` is an argument of the tokenizer
# *call* (see the sample-input cell), not of `from_pretrained`, so it is not
# passed here.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Keep a local copy; the returned tuple of written files is the cell output.
tokenizer.save_pretrained(f"cache/tokenizer/{model_name}")



('cache/tokenizer/facebook/galactica-1.3b/tokenizer_config.json',
 'cache/tokenizer/facebook/galactica-1.3b/special_tokens_map.json',
 'cache/tokenizer/facebook/galactica-1.3b/tokenizer.json')

### Sample input to trace the model

In [3]:
# Short prompt whose token ids serve as the example input for tracing.
input_text = "Breast Cancer is"
inputs = tokenizer(input_text, return_tensors="pt")
# Only the token ids are needed; place them on the tracing device.
input_ids = inputs.input_ids.to(device)

## Download and save the model

In [18]:
# Load the checkpoint with torchscript=True so it can be traced with
# torch.jit.trace. It is loaded without `device_map="mps"`: the model is moved
# to `device` (CPU) right away, so routing the load through MPS only added a
# hardware/accelerate requirement.
model = OPTForCausalLM.from_pretrained(model_name, torchscript=True)
model.to(device)
model.eval()  # inference mode: disables dropout before tracing
# NOTE(review): the conversion log below still reports a tuple at the graph
# output, so with torchscript=True this flag appears not to change what the
# traced model returns — confirm before relying on dict outputs.
model.config.return_dict = True
# save the model
model.save_pretrained(f"cache/model/{model_name}")

In [20]:
# Record a TorchScript trace of the model on the sample token ids.
# Gradients are not needed for tracing, so they are disabled.
with torch.no_grad():
    traced_model = torch.jit.trace(model, (input_ids,))

# Alternative (unused) ways to convert the model, kept for reference: ONNX export / torch.export

# Prepare example input
# input_ids = torch.randint(0, model.config.vocab_size, (1, 512))

# Export the model to ONNX
# torch.onnx.export(
#     model,
#     (input_ids,),
#     "model.onnx",
#     input_names=['input_ids'],
#     output_names=['output'],
#     dynamic_axes={'input_ids': {0: 'batch_size', 1: 'sequence_length'}},
#     opset_version=11,
# )

# torch.export.export()

  if (input_shape[-1] > 1 or self.sliding_window is not None) and self.is_causal:
  if past_key_values_length > 0:
  if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
  if attention_mask.size() != (bsz, 1, tgt_len, src_len):
  attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min, device=attn_weights.device)
  if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):


### Convert the model

In [None]:
# Convert the traced model to Core ML, restricted to the CPU.
mlmodel = ct.convert(
    traced_model,
    # ct.TensorType expects a NumPy (or MIL) dtype; the torch dtype
    # `input_ids.dtype` (torch.int64) is not one, so token ids are declared
    # as int32 instead.
    inputs=[ct.TensorType(shape=input_ids.shape, dtype=np.int32)],
    compute_units=ct.ComputeUnit.CPU_ONLY,  # or .CPU_AND_NE if you have a device with Neural Engine
)

### Quantization (FP16 compute precision)

In [21]:
# Convert the traced model again, this time computing in float16.
mlmodel_fp16 = ct.convert(
    traced_model,
    convert_to="mlprogram",  # Use ML Program format for advanced features
    compute_precision=ct.precision.FLOAT16,
    inputs=[ct.TensorType(shape=input_ids.shape)],
    source="auto",
)

Tuple detected at graph output. This will be flattened in the converted model.
Converting PyTorch Frontend ==> MIL Ops:   0%|          | 0/1967 [00:00<?, ? ops/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Core ML embedding (gather) layer does not support any inputs besides the weights and indices. Those given will be ignored.
Saving value type of int64 into a builtin type of int32, might lose precision!
Saving value type of int64 into a builtin type of int32, might lose precision!
Converting PyTorch Frontend ==> MIL Ops:  99%|█████████▊| 1940/1967 [00:00<00:00, 8524.26 ops/s]
Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 60.89 passes/s]
Running MIL default pipeline: 100%|██████████| 89/89 [00:25<00:00,  3.48 passes/s]
Runn

Although I could also apply 8-bit quantization, it would take significant time and careful consideration of every layer.

In [None]:
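# Quantization configuration (untested sketch — the names below are unverified; coremltools exposes weight quantization under `coremltools.optimize.coreml`, TODO confirm)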
# from coremltools.converters.mil.mil.passes.quantization_passes import OpSelector
#
# config = ct.QuantizationConfig(
#     conversion_type=ct.conversion_type.DYNAMIC,
#     op_selector=OpSelector(supported_ops=["linear"]),  # Quantize supported layers
# )
#
# mlmodel_int8 = ct.convert(
#     traced_model,
#     inputs=[ct.TensorType(shape=input_ids.shape)],
#     convert_to="mlprogram",
#     quantization_config=config,
# )

In [23]:
# Save the FP16 Core ML model to disk as an .mlpackage.
mlmodel_fp16.save("cache/model/quantized/Galactica_1_3B_fp16.mlpackage")

# Pruning
Reduce the model size even further with pruning.

In [24]:
import torch.nn.utils.prune as prune

# Fraction of each linear layer's weight rows to prune (0.3 == 30%).
prune_amount = 0.3

# Structured L2 pruning of every nn.Linear: along dim=0 of `weight`, the
# `prune_amount` share of slices with the lowest L2 norm is masked out.
# NOTE(review): this matches *every* nn.Linear in the model, which presumably
# includes the output projection (lm_head) — confirm that is intended.
for module in model.modules():
    if not isinstance(module, torch.nn.Linear):
        continue
    prune.ln_structured(module, name="weight", amount=prune_amount, n=2, dim=0)

In [25]:
# Make the pruning permanent: drop the pruning re-parameterization from each
# linear layer so `weight` is a plain parameter again.
for module in model.modules():
    if not isinstance(module, torch.nn.Linear):
        continue
    prune.remove(module, "weight")

In [26]:
# Re-trace the model so the trace reflects the pruned weights.
with torch.no_grad():
    traced_model_pruned = torch.jit.trace(model, (input_ids,))

In [27]:
# Convert the pruned trace to a float16 ML Program, then write it to disk.
mlmodel_pruned_fp16 = ct.convert(
    traced_model_pruned,
    convert_to="mlprogram",
    compute_precision=ct.precision.FLOAT16,
    inputs=[ct.TensorType(shape=input_ids.shape)],
)
mlmodel_pruned_fp16.save("cache/model/pruned/Galactica_1_3B_pruned_fp16.mlpackage")

Tuple detected at graph output. This will be flattened in the converted model.
Converting PyTorch Frontend ==> MIL Ops:   0%|          | 0/1967 [00:00<?, ? ops/s]Core ML embedding (gather) layer does not support any inputs besides the weights and indices. Those given will be ignored.
Saving value type of int64 into a builtin type of int32, might lose precision!
Saving value type of int64 into a builtin type of int32, might lose precision!
Converting PyTorch Frontend ==> MIL Ops:  99%|█████████▊| 1940/1967 [00:00<00:00, 8374.85 ops/s]
Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 69.88 passes/s]
Running MIL default pipeline: 100%|██████████| 89/89 [00:26<00:00,  3.38 passes/s]
Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:02<00:00,  5.95 passes/s]


In [11]:
# Generate text with the PyTorch model as a sanity check.
# Use the device the model's weights actually live on. Hard-coding "mps" here
# contradicted the "CPU" comment and, because the model is moved to the CPU
# after loading, put the inputs and the weights on different devices.
device = next(model.parameters()).device

input_text = "Breast Cancer is"
# Tokenize input text and get attention mask
tokenizer_output = tokenizer(
    input_text,
    return_tensors="pt",
    return_attention_mask=True,  # This is True by default but being explicit helps
)
input_ids = tokenizer_output["input_ids"].to(device)
attention_mask = tokenizer_output["attention_mask"].to(device)

# Generate text; max_length caps the length of the returned sequence.
outputs = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_length=100,
)

# Decode generated output
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

Breast Cancer is the most common cancer in women worldwide. The incidence of breast cancer is increasing in developing countries. The aim of this study was to evaluate the effect of the combination of tamoxifen and raloxifene on the expression of estrogen receptor (ER), progesterone receptor (PR), and human epidermal growth factor receptor 2 (HER2) in breast cancer.

