## Install required tools

In [None]:
!pip install git+https://github.com/openai/whisper.git
!pip install onnx
!pip install onnx_tf
!git clone https://github.com/usefulsensors/openai-whisper.git
!git clone https://github.com/openai/whisper.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-1nc4rsn0
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-1nc4rsn0
  Resolved https://github.com/openai/whisper.git to commit f5bfe004eccc3837a0d198baf7602ec7bccffafd
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers>=4.19.0
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m36.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpeg-python==0.2.0
  Downloading ffmpeg_python-0.2.0-py3-none-any.whl (25 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
%%capture
!pip install optimum[onnxruntime] transformers git+https://github.com/openai/whisper.git

# Convert Whisper to ONNX

In [None]:
# -*- coding: utf-8 -*-
import warnings
warnings.filterwarnings("ignore")

from optimum.onnxruntime import ORTModelForSpeechSeq2Seq
from transformers import (
    set_seed,
    AutoProcessor
)
from pathlib import Path
import os

# Seed handed to transformers.set_seed() before the ONNX export.
SEED = 42

# Export the vanilla ONNX model (despite the name, no optimized variant is produced here)
def export_vanilla_optimized_onnx(model_checkpoint):
    """Export a speech seq2seq checkpoint to ONNX and save it with its processor.

    The seed is fixed with ``set_seed(SEED)``, the processor and the ONNX model
    are loaded from ``model_checkpoint``, and both are written to
    ``exported_onnx_models/<model_checkpoint>``.

    Args:
        model_checkpoint: Checkpoint identifier passed to ``from_pretrained``
            (the notebook uses ``'openai/whisper-tiny'``).

    Returns:
        None. The results are the files written to the export directory.
    """
    set_seed(SEED)
    processor = AutoProcessor.from_pretrained(model_checkpoint)

    # Vanilla export: load through Optimum, which performs the ONNX conversion.
    ort_model = ORTModelForSpeechSeq2Seq.from_pretrained(
        model_checkpoint,
        from_transformers=True,
        use_cache=True,
    )

    # Model and processor share one directory so they can be reloaded together.
    export_dir = Path("exported_onnx_models/", model_checkpoint)
    ort_model.save_pretrained(export_dir)
    processor.save_pretrained(export_dir)


# Run the export for the tiny Whisper checkpoint.
export_vanilla_optimized_onnx('openai/whisper-tiny')

Downloading:   0%|          | 0.00/185k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/828 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/494k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.11k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.96k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/151M [00:00<?, ?B/s]

## Generate the Whisper encoder TFLite (hybrid) model and run inference

In [None]:
import whisper
import torch
import tensorflow as tf
import onnx
import numpy as np
import argparse
import os
import warnings
import tqdm
from onnx_tf.backend import prepare
from whisper.audio import load_audio, log_mel_spectrogram,pad_or_trim,N_FRAMES, SAMPLE_RATE
# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

# Artifact locations for each stage of the encoder conversion.
onnx_model_path = './whisper-encoder.onnx'
tf_model_path = 'model_tf-encoder'
saved_model_dir = 'model_tf-encoder'
tflite_model_path = 'whisper-encoder-hybrid.tflite'

# Load the OpenAI Whisper "tiny" PyTorch model.
tiny_model = whisper.load_model("tiny")

# Export the encoder to ONNX, using a random (1, 80, 3000) tensor as the example input.
encoder_example_input = torch.randn(1, 80, 3000).to(device)
torch.onnx.export(tiny_model.encoder, encoder_example_input, "./whisper-encoder.onnx")

# Read the ONNX graph back and write it out as a TensorFlow model through onnx-tf.
onnx_model = onnx.load(onnx_model_path)
tf_rep = prepare(onnx_model)
tf_rep.export_graph(tf_model_path)

def representative_dataset_random():
    """Yield 100 calibration samples of uniform random noise.

    Each sample is a one-element list holding a float32 array of shape
    (1, 80, 3000), the same shape as the example input used for the
    encoder export.
    """
    sample_count = 100
    produced = 0
    while produced < sample_count:
        noise = np.random.rand(1, 80, 3000).astype(np.float32)
        yield [noise]
        produced += 1

def representative_dataset():
    """Yield calibration samples computed from the bundled JFK audio clip.

    Each sample is a one-element list holding the log-Mel spectrogram of the
    clip, padded or trimmed to ``N_FRAMES`` and given a leading batch axis.
    The shape of every sample is printed as it is produced.
    """
    # Change this to 100 and provide 100 different audio files from a known dataset
    sample_count = 1
    for _ in range(sample_count):
        mel = log_mel_spectrogram('/content/whisper/tests/jfk.flac')
        batch = tf.expand_dims(pad_or_trim(mel, N_FRAMES), 0)
        print(batch.shape)
        yield [batch]

# Convert the SavedModel to a hybrid TFLite model: default optimizations with
# float32 inputs/outputs and no representative dataset (this is not a full int8 model).
#
# To attempt full int8 quantization instead, the settings to change are:
#   - supported_ops: tf.lite.OpsSet.TFLITE_BUILTINS_INT8 (plus SELECT_TF_OPS)
#   - converter.representative_dataset = representative_dataset
#   - converter.inference_input_type / inference_output_type = tf.int8 (or tf.uint8)
encoder_supported_ops = [
    tf.lite.OpsSet.TFLITE_BUILTINS,  # TensorFlow Lite built-in ops
    tf.lite.OpsSet.SELECT_TF_OPS,    # fall back to TensorFlow ops where needed
]

converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.target_spec.supported_ops = encoder_supported_ops
converter.inference_input_type = tf.float32
converter.inference_output_type = tf.float32
tflite_model = converter.convert()

# Write the serialized flatbuffer to disk.
with open(tflite_model_path, 'wb') as tflite_file:
    tflite_file.write(tflite_model)
import tensorflow as tf
import numpy as np
# Run the converted encoder on a real audio clip and keep its output for the decoder cells.
#
# Both paths are relative to the working directory: the model is loaded from the
# location it was just written to, and the clip comes from the openai/whisper checkout
# cloned by the install cell. On Colab (cwd=/content) these are the same files as the
# previous hard-coded /content/... paths.
tflite_model_path = 'whisper-encoder-hybrid.tflite'
audio_path = 'whisper/tests/jfk.flac'

# Load the TFLite model and allocate tensors
interpreter_enc = tf.lite.Interpreter(model_path=tflite_model_path)
interpreter_enc.allocate_tensors()

# Query the tensor metadata once and reuse it for the report and for inference.
input_details = interpreter_enc.get_input_details()
output_details = interpreter_enc.get_output_details()
output_tensor = output_details[0]['index']

print("== Input details ==")
print("name:", input_details[0]['name'])
print("shape:", input_details[0]['shape'])
print("type:", input_details[0]['dtype'])

print("\nDUMP INPUT")
print(input_details[0])

print("\n== Output details ==")
print("name:", output_details[0]['name'])
print("shape:", output_details[0]['shape'])
print("type:", output_details[0]['dtype'])

print("\nDUMP OUTPUT")
print(output_details[0])

# Build the encoder input from the audio clip: load it, pad/trim the waveform,
# compute the log-Mel spectrogram and add a leading batch axis.
audio = whisper.load_audio(audio_path)
audio = whisper.pad_or_trim(audio)
mel = whisper.log_mel_spectrogram(audio)
# Hand the interpreter an explicit float32 NumPy array (the converter was configured
# with float32 inputs) instead of relying on implicit coercion of the spectrogram.
mel = np.expand_dims(np.asarray(mel, dtype=np.float32), 0)
interpreter_enc.set_tensor(input_details[0]['index'], mel)

interpreter_enc.invoke()
print("Whisper Encoder Inference executed successfully\n")

# encoder_output_data is consumed by the decoder cells further down.
encoder_output_data = interpreter_enc.get_tensor(output_tensor)
print(encoder_output_data.shape)
print(encoder_output_data)
# Dump the values as text; the original shape goes into the header line.
np.savetxt("encoder_output.txt", encoder_output_data.reshape((3,-1)), fmt="%s", header=str(encoder_output_data.shape))

Using device: cpu


100%|██████████████████████████████████████| 72.1M/72.1M [00:00<00:00, 126MiB/s]


== Input details ==
name: serving_default_x.1:0
shape: [   1   80 3000]
type: <class 'numpy.float32'>

DUMP INPUT
{'name': 'serving_default_x.1:0', 'index': 0, 'shape': array([   1,   80, 3000], dtype=int32), 'shape_signature': array([   1,   80, 3000], dtype=int32), 'dtype': <class 'numpy.float32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}

== Output details ==
name: PartitionedCall:0
shape: [   1 1500  384]
type: <class 'numpy.float32'>

DUMP OUTPUT
{'name': 'PartitionedCall:0', 'index': 557, 'shape': array([   1, 1500,  384], dtype=int32), 'shape_signature': array([   1, 1500,  384], dtype=int32), 'dtype': <class 'numpy.float32'>, 'quantization': (0.0, 0), 'quantization_parameters': {'scales': array([], dtype=float32), 'zero_points': array([], dtype=int32), 'quantized_dimension': 0}, 'sparsity_parameters': {}}
Whisper Encoder Inference execute

## Export the PyTorch Whisper decoder to ONNX

In [None]:
import whisper
import torch

model = whisper.load_model("tiny")
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Example inputs for the export: a short sequence of token ids and a random
# tensor standing in for the encoder output.
sample_tokens = torch.tensor([[50258, 50259, 50359, 50363]]).to(device)
sample_hidden_states = torch.randn(1, 1500, 384).to(device)

torch.onnx.export(
    model.decoder,
    (sample_tokens, sample_hidden_states),
    "./decoder.onnx",  # name of the exported ONNX file
    # Author's note: opset 13 failed with an "unsupported squeeze" error, hence 10.
    opset_version=10,
    input_names=['tokens', 'hidden_states'],
    output_names=['output'],
    # Axis 1 of the tokens and of the output is the variable sequence length.
    dynamic_axes={
        'tokens': {1: 'toks'},
        'output': {1: 'toks'},
    },
)

## Generate the decoder language TFLite (hybrid) model

In [None]:
from onnx_tf.backend import prepare
import onnx

# Read the decoder from the same relative path the export cell wrote ("./decoder.onnx"),
# so this cell does not depend on the working directory being /content.
onnx_model_path = './decoder.onnx'
tf_model_path = 'model_tf-decoder-language'

onnx_model = onnx.load(onnx_model_path)
# NOTE(review): dynamic_input / dynamic_output are passed as extra keyword arguments;
# confirm that the installed onnx-tf version actually honours them.
tf_rep = prepare(onnx_model, dynamic_input=["serving_default_tokens"],dynamic_output=["PartitionedCall"])

# Write the TensorFlow model; the TFLite converter below reads it from saved_model_dir.
tf_rep.export_graph(tf_model_path)
saved_model_dir = 'model_tf-decoder-language'
tflite_model_path = 'whisper-decoder-language-hybrid.tflite'

# Calibration samples for the decoder, built from the real encoder output rather than
# random data (np.random.rand(1,1500,384) would be the random alternative).
def representative_dataset_random():
    """Yield 10 identical decoder calibration samples.

    Each sample is ``[token_ids, hidden_states]``: a (1, 4) int64 array of
    prompt token ids and the global ``encoder_output_data`` cast to float32.
    """
    remaining = 10
    while remaining > 0:
        input_tensor = encoder_output_data
        decoder_input_ids = np.array([[50258, 50259, 50359, 50363]], dtype=np.int64)
        yield [decoder_input_ids, input_tensor.astype(np.float32)]
        remaining -= 1
# Convert the decoder SavedModel to a hybrid TFLite model: default optimizations with
# float32 inputs/outputs and no representative dataset.
#
# To attempt full int8 quantization instead, the settings to change are:
#   - supported_ops: tf.lite.OpsSet.TFLITE_BUILTINS_INT8 (plus SELECT_TF_OPS)
#   - converter.representative_dataset = representative_dataset_random
#   - converter.inference_input_type / inference_output_type = tf.int8 (or tf.uint8)
converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
converter.target_spec.supported_ops = [
  tf.lite.OpsSet.TFLITE_BUILTINS, # enable TensorFlow Lite ops.
  tf.lite.OpsSet.SELECT_TF_OPS # enable TensorFlow ops.
]
converter.inference_input_type = tf.float32
converter.inference_output_type = tf.float32
converter.optimizations = [tf.lite.Optimize.DEFAULT]
tflite_model = converter.convert()

# Save the model. tflite_model_path keeps pointing at the file that was actually
# written (relative to the working directory) instead of being re-pointed to a
# hard-coded /content/... location that only exists when the notebook runs on Colab.
with open(tflite_model_path, 'wb') as f:
    f.write(tflite_model)





## Run Whisper decoder inference

In [None]:
import numpy as np
# Greedy decoding with the TFLite decoder: feed the encoder output plus the tokens
# generated so far, take the arg-max of the last position, append it, and repeat.
#
# The model is loaded from the relative path the conversion cell wrote to, so the
# cell does not depend on the working directory being /content. Point this at another
# decoder build (e.g. an int8 variant) to try it instead.
tflite_model_path = 'whisper-decoder-language-hybrid.tflite'
print(tflite_model_path)

EOT_TOKEN = 50257          # decoding stops once this id is produced
# Safety bound so the loop cannot run forever if EOT_TOKEN is never produced.
# NOTE(review): 448 is assumed to be Whisper's decoder context length - confirm.
MAX_DECODE_TOKENS = 448

# Load the TFLite model and allocate tensors
interpreter = tf.lite.Interpreter(model_path=tflite_model_path)
interpreter.allocate_tensors()

start_tokens = [50258, 50266, 50358, 50363] #<|startoftranscript|><|ja|><|translate|><|notimestamps|>
tokens = list(start_tokens)  # copy, so the prompt list is not mutated by the loop
decoder_input_ids = np.array([start_tokens], dtype=np.int64)
print(decoder_input_ids.shape)
print(encoder_output_data.shape)

# Tell the two inputs apart by dtype instead of by position: the token ids are the
# integer input, the encoder hidden states the floating-point one.
decoder_inputs = interpreter.get_input_details()
token_inputs = [d['index'] for d in decoder_inputs if np.issubdtype(d['dtype'], np.integer)]
hidden_inputs = [d['index'] for d in decoder_inputs if not np.issubdtype(d['dtype'], np.integer)]
if len(token_inputs) != 1 or len(hidden_inputs) != 1:
    raise ValueError("expected exactly one integer (tokens) and one float (hidden states) input")
input_tensor_1 = hidden_inputs[0]  # encoder hidden states
input_tensor_2 = token_inputs[0]   # decoder token ids
output_tensor = interpreter.get_output_details()[0]['index']

while True:
    # The token input grows by one every step, so resize and re-allocate first, then
    # write BOTH inputs. Previously the encoder output was written only once, before
    # any re-allocation, and had to survive every later allocate_tensors() call.
    interpreter.resize_tensor_input(input_tensor_2, decoder_input_ids.shape)
    interpreter.allocate_tensors()
    interpreter.set_tensor(input_tensor_1, encoder_output_data)
    interpreter.set_tensor(input_tensor_2, decoder_input_ids)
    interpreter.invoke()

    output_data = interpreter.get_tensor(output_tensor)
    # Greedy choice: highest-scoring entry at the last sequence position.
    last_token = int(np.argmax(output_data[0, -1]))
    print(last_token)
    tokens.append(last_token)

    # Stop before preparing another step once decoding is finished.
    if last_token == EOT_TOKEN or len(tokens) >= MAX_DECODE_TOKENS:
        break

    new_value = np.array([[last_token]], dtype=np.int64)
    decoder_input_ids = np.concatenate([decoder_input_ids, new_value], axis=1)

from transformers import (
    AutoTokenizer
)
# Turn the generated token ids back into text with the tokenizer of the same checkpoint.
skip_special_tokens = True
model_id = "openai/whisper-tiny"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# batch_decode works on a batch, so give the single sequence a leading batch axis
# and take the first (only) decoded string as the cell's result.
token_batch = np.expand_dims(tokens, axis=0)
decoded_batch = tokenizer.batch_decode(token_batch, skip_special_tokens=skip_special_tokens)
decoded_batch[0]




/content/whisper-decoder-language-hybrid.tflite
(1, 4)
(1, 1500, 384)
400
370
452
7177
6280
1029
406
437
428
1941
393
360
337
291
11
1029
437
291
393
360
337
428
1941
13
50257


' And so my fellow Americans ask not what your country can do for you, ask what you can do for your country.'