In [1]:
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Dict,
    Iterator,
    List,
    Optional,
    Sequence,
    Tuple,
    Union,
)
from tvm.relax.frontend import nn
from tvm.relax.frontend.nn import Tensor, op
from tvm.runtime import Device, NDArray, load_static_library, ndarray
from mlc_chat.model.whisper_tiny.whisper_model import WhisperConfig, WhisperForConditionalGeneration



In [2]:
import importlib


def is_importable(module_name: str) -> bool:
    """Return True when `module_name` can be imported by the running kernel.

    The check runs inside the kernel's own interpreter rather than through a
    `!python` shell call, because the shell may resolve `python` to a
    different environment than the one this notebook executes in.
    """
    try:
        importlib.import_module(module_name)
    except ImportError:
        return False
    return True


# Report the availability of each package this notebook depends on.
for module_name in ("tvm", "mlc_chat"):
    if is_importable(module_name):
        print(f"{module_name} successfully installed")
    else:
        print(f"{module_name} is NOT importable from this kernel")

tvm sucessfully installed
mlc_chat sucessfully installed


Weight conversion, config generation, and model compilation


In [3]:
import tvm

# Probe the target backend and report whether the runtime can see it.
device_str = "vulkan"
device = tvm.runtime.device(device_str)
print(f"{device_str} device {'found' if device.exist else 'not found'}")

vulkan device found


In [4]:
import mlc_chat.cli.convert_weight as cv
import mlc_chat.cli.compile as c
import mlc_chat.cli.gen_config as gencfg
import mlc_chat.cli.check_device as cdev

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Convert the whisper-tiny checkpoint under ../dist/models/ into MLC weight
# shards under ../dist/libs/, using the q0f32 quantization setting.
cv.main(
    [
        "--model-type", "whisper-tiny",
        "../dist/models/whisper-tiny/",
        "--quantization", "q0f32",
        "-o", "../dist/libs/whisper-tiny/",
    ]
)

[1mWeight conversion with arguments:[0m
  [1m--config[0m          ../dist/models/whisper-tiny/config.json
  [1m--quantization[0m    NoQuantize(name='q0f32', kind='no-quant', model_dtype='float32')
  [1m--model-type[0m      whisper-tiny
  [1m--device[0m          vulkan:0
  [1m--source[0m          ../dist/models/whisper-tiny/pytorch_model.bin
  [1m--source-format[0m   huggingface-torch
  [1m--output[0m          ../dist/libs/whisper-tiny


100%|██████████| 168/168 [00:00<00:00, 286.34it/s]


Start storing to cache ../dist/libs/whisper-tiny
[0168/0168] saving model.decoder.layer_norm.bias                        
All finished, 4 total shards committed, record saved to ../dist/libs/whisper-tiny/ndarray-cache.json
Also saved a bf16 record to ../dist/libs/whisper-tiny/ndarray-cache-b16.json


In [6]:
# Generate the chat config for whisper-tiny next to the converted weights,
# using the "whisper" conversation template.
gencfg.main(
    [
        "--model-type", "whisper-tiny",
        "../dist/models/whisper-tiny/",
        "--quantization", "q0f32",
        "--conv-template", "whisper",
        "--output", "../dist/libs/whisper-tiny/",
    ]
)

In [7]:
# Path passed to the compile step through its --debug-dump option.
DEBUG_DUMP: str = "../dist/libs/whisper-tiny/debug_dump"

Compile the model library for Vulkan

In [8]:
# Compile the whisper-tiny model library for the Vulkan backend, reading the
# model directory prepared above and writing debug artifacts to DEBUG_DUMP.
c.main(
    [
        "--model-type", "whisper-tiny",
        "../dist/libs/whisper-tiny/",
        "--quantization", "q0f32",
        "--device", "vulkan",
        "--output", "../dist/libs/whisper-tiny/whisper-tiny.so",
        "--debug-dump", DEBUG_DUMP,
    ]
)

[1mCompiling with arguments:[0m
  [1m--config[0m          WhisperConfig(vocab_size=51865, num_mel_bins=80, encoder_layers=4, encoder_attention_heads=6, decoder_layers=4, decoder_attention_heads=6, decoder_ffn_dim=1536, encoder_ffn_dim=1536, d_model=384, max_source_positions=1500, max_target_positions=448, pad_token_id=50257, context_window_size=448, prefill_chunk_size=448, tensor_parallel_shards=1, kwargs={'model_type': 'whisper-tiny', 'quantization': 'q0f32', 'model_config': {'vocab_size': 51865, 'num_mel_bins': 80, 'encoder_layers': 4, 'encoder_attention_heads': 6, 'decoder_layers': 4, 'decoder_attention_heads': 6, 'decoder_ffn_dim': 1536, 'encoder_ffn_dim': 1536, 'd_model': 384, 'max_source_positions': 1500, 'max_target_positions': 448, 'pad_token_id': 50257, 'context_window_size': 448, 'prefill_chunk_size': 448, 'tensor_parallel_shards': 1}, 'sliding_window_size': -1, 'attention_sink_size': -1, 'mean_gen_len': 128, 'max_gen_len': 512, 'shift_fill_factor': 0.3, 'temperature': 0.

Compile the model library for Android

In [None]:
#c.main(["--model-type", "whisper-tiny", "/home/munusairam/softwares/programs/python_projects/whisper-tiny/","--quantization", "q4f32_1", "--device", "android",  "--output", "/home/munusairam/softwares/programs/python_projects/whisper-tiny/whisper_tiny_q4f32_1_android.tar" ,"--debug-dump", DEBUG_DUMP])

In [None]:
# 1. gen_config: generate mlc-chat-config.json and process tokenizers
#!mlc_chat gen_config ./dist/models/gpt2 --quantization q4f16_1 --conv-template gpt2 \
#    -o dist/gpt2-q4f16_1-MLC/
    

# 2. compile: compile model library with specification in mlc-chat-config.json
#!mlc_chat compile ./dist/gpt2-q4f16_1-MLC/mlc-chat-config.json \
#    --device cuda -o dist/gpt2-q4f16_1-MLC/gpt2-q4f16_1-cuda.so