In [18]:
from model import Translator
from dataset import TextDataset
import torch
import tqdm
from tokenizers import Tokenizer
import os
import coremltools as ct
import numpy as np

In [19]:

# Report the framework version so the export can be tied to a known PyTorch build.
print(f"Using PyTorch version {torch.__version__}")

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device {device}")

# Let float32 matmuls take the faster reduced-precision path
# (tensor cores, where the hardware has them).
torch.set_float32_matmul_precision("high")

# Choose the scaled-dot-product-attention backends: only the plain "math"
# kernel stays enabled; the fused flash and memory-efficient kernels are
# switched OFF.
# NOTE(review): this cell used to be labelled "use flash attention", which is
# the opposite of what these flags do — presumably the math kernel is wanted
# for tracing/export; confirm the intent.
torch.backends.cuda.enable_math_sdp(True)
torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)

Using PyTorch version 2.3.0+cu121
Using device cuda


In [20]:
# Rebuild the network on the CPU. These hyper-parameters presumably mirror the
# ones used to train ../models/model.pt; load_state_dict below will complain
# if the shapes do not line up.
model = Translator(
    engVocabSize=804,
    hilliVocabSize=292,
    embed_size=256,
    num_encoder_blocks=5,
    num_decoder_blocks=5,
    num_heads=8,
    dropout=0.1,
    pad_char=2,
)

# The checkpoint file holds a whole pickled module (hence .state_dict() on the
# loaded object), so only load files from a trusted source.
# map_location="cpu" keeps the load working on machines without CUDA and
# avoids materialising a second copy of the model on the GPU; the weights end
# up in the CPU-resident `model` either way.
model.load_state_dict(
    torch.load("../models/model.pt", map_location="cpu").state_dict()
)

# Inference mode (disables dropout); as the last expression this also displays
# the module summary.
model.eval()

Translator(
  (engEmbedding): Embedding(804, 256)
  (hilliEmbedding): Embedding(292, 256)
  (decoder_block): ModuleList(
    (0-4): 5 x Decoder(
      (feed_forward): Sequential(
        (0): Dropout(p=0.1, inplace=False)
        (1): Linear(in_features=256, out_features=512, bias=False)
        (2): ReLU()
        (3): Linear(in_features=512, out_features=256, bias=False)
        (4): ReLU()
      )
      (layernorm): RMSNorm()
      (layernorm2): RMSNorm()
      (layernorm3): RMSNorm()
      (MHA): SelfAttention(
        (c_attn): Linear(in_features=256, out_features=768, bias=False)
        (c_proj): Linear(in_features=256, out_features=256, bias=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (CA): CrossAttention(
        (query_attn): Linear(in_features=256, out_features=256, bias=False)
        (key_attn): Linear(in_features=256, out_features=256, bias=False)
        (value_attn): Linear(in_features=256, out_features=256, bias=False)
        (c_proj): 

In [21]:
# Dummy token-id batches used only to record the graph: one sequence of 100
# ids each, drawn below the two vocabulary sizes (804 and 292) the model was
# built with. Tracing bakes these shapes in.
example_inputs = (
    torch.randint(0, 804, (1, 100)),
    torch.randint(0, 292, (1, 100)),
)
# NOTE: rebinding `model` makes this cell non-idempotent — re-running it
# traces the already-traced module.
model = torch.jit.trace(model, example_inputs)

  assert query_batch == key_batch == value_batch
  assert query_channels == key_channels == value_channels


In [22]:
# NOTE(review): `model` is already a traced TorchScript module at this point,
# so scripting it again is expected to hand back the same object — confirm,
# and consider dropping this cell.
model = torch.jit.script(model)

In [26]:
# Convert the TorchScript module to Core ML. Both inputs are fixed-shape
# (1, 100) int32 tensors, matching the shapes used when tracing.
# NOTE(review): this emits the legacy "neuralnetwork" format, while the
# compression cells further down log an ML Program pipeline and the final
# artifact is saved as .mlpackage — confirm which format this cell should
# produce.
mlmodel = ct.convert(
    model,
    inputs=[
        ct.TensorType(name="x", shape=(1, 100), dtype=np.int32),
        ct.TensorType(name="originalText", shape=(1, 100), dtype=np.int32),
    ],
    convert_to="neuralnetwork",
)

Converting PyTorch Frontend ==> MIL Ops:   0%|          | 0/2361 [00:00<?, ? ops/s]Saving value type of int64 into a builtin type of int32, might lose precision!
Saving value type of int64 into a builtin type of int32, might lose precision!
Saving value type of int64 into a builtin type of int32, might lose precision!
Saving value type of int64 into a builtin type of int32, might lose precision!
Converting PyTorch Frontend ==> MIL Ops:   6%|▋         | 152/2361 [00:00<00:01, 1518.65 ops/s]Saving value type of int64 into a builtin type of int32, might lose precision!
Saving value type of int64 into a builtin type of int32, might lose precision!
Saving value type of int64 into a builtin type of int32, might lose precision!
Saving value type of int64 into a builtin type of int32, might lose precision!
Saving value type of int64 into a builtin type of int32, might lose precision!
Converting PyTorch Frontend ==> MIL Ops:  17%|█▋        | 391/2361 [00:00<00:00, 2026.67 ops/s]Saving value typ

In [27]:
# Write the uncompressed converted model next to the notebook as a single
# .mlmodel file.
mlmodel.save("model.mlmodel")

In [15]:
from coremltools.optimize.coreml import (
    OpMagnitudePrunerConfig,
    OptimizationConfig,
    prune_weights,
)

# The coremltools.optimize.coreml passes only operate on ML Program models.
# The conversion cell above currently emits the "neuralnetwork" format, so on
# a fresh Restart & Run All `mlmodel` would be rejected here. Re-use `mlmodel`
# when it already is an ML Program; otherwise convert the traced module again
# with the same input signature, this time targeting "mlprogram".
mlprogram_model = mlmodel
if mlmodel.get_spec().WhichOneof("Type") != "mlProgram":
    mlprogram_model = ct.convert(
        model,
        inputs=[
            ct.TensorType(name="x", shape=(1, 100), dtype=np.int32),
            ct.TensorType(name="originalText", shape=(1, 100), dtype=np.int32),
        ],
        convert_to="mlprogram",
    )

# Magnitude pruning: aim for 60% zeros per weight tensor, leaving alone
# tensors that have no more than 1024 elements.
op_config = OpMagnitudePrunerConfig(
    target_sparsity=0.6,
    weight_threshold=1024,
)
config = OptimizationConfig(global_config=op_config)
model_compressed = prune_weights(mlprogram_model, config=config)

Running compression pass prune_weights: 100%|██████████| 95/95 [00:01<00:00, 74.91 ops/s] 
Running MIL frontend_milinternal pipeline: 0 passes [00:00, ? passes/s]
Running MIL default pipeline: 100%|██████████| 76/76 [00:01<00:00, 39.79 passes/s]
Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:00<00:00, 102.54 passes/s]


In [16]:
from coremltools.optimize.coreml import OpPalettizerConfig, OptimizationConfig, palettize_weights

# Palettize the pruned model: cluster each eligible weight tensor with k-means
# into an 8-bit lookup table, skipping tensors at or below 512 elements.
op_config = OpPalettizerConfig(
    mode="kmeans",
    nbits=8,
    weight_threshold=512,
)
config = OptimizationConfig(global_config=op_config)

# NOTE(review): the result name says "6_bit" but nbits above is 8 — one of the
# two is stale. The name is kept because the save cell below refers to it.
compressed_6_bit_model = palettize_weights(model_compressed, config=config)

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
Running compression pass palettize_weights: 100%|██████████| 160/160 [00:00<00:00, 412.06 ops/s]
Running MIL frontend_milinternal pipeline: 0 passes [00:00, ? passes/s]
Running MIL default pipeline: 100%|██████████| 76/76 [00:01<00:00, 39.55 passes/s]
Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:00<00:00, 72.12 passes/s]


In [17]:
# Persist the pruned + palettized model as an .mlpackage bundle.
compressed_6_bit_model.save("model.mlpackage")