### Loading the Necessary Github Repos
* <b>Please double-check that the git repos were cloned properly; otherwise the notebook will run into multiple issues.</b>

In [None]:
# ONNX tooling. In this notebook onnx_tf is only referenced by a commented-out import.
!pip install onnx
!pip install onnx_tf
# Repos used later: openai-whisper provides the TFLite decoder under openai-whisper/models/,
# and whisper is the PyTorch implementation that gets patched and exported to ONNX.
!git clone https://github.com/usefulsensors/openai-whisper.git
!git clone https://github.com/openai/whisper.git

In [None]:
%%bash
# Copy the local patch into the cloned openai/whisper checkout and apply it from the repo root.
# NOTE(review): whisper.patch must already exist next to this notebook; it is not created here.
cp whisper.patch whisper/
cd whisper/
patch -p1 < ./whisper.patch

### Installing Necessary Libraries

In [None]:
# Pinned library versions for the rest of the notebook; only `datasets` is left unpinned.
!pip3 install transformers==4.29.2 
!pip3 install safetensors==0.3.0
!pip3 install pyyaml==5.3
!pip3 install numpy==1.22.2
!pip3 install torchvision==0.15.2
!pip3 install packaging==21.3
!pip3 install datasets

In [None]:
import whisper
import torch
import tensorflow as tf
import onnx
import numpy as np
import argparse
import os
import warnings
import tqdm
#from onnx_tf.backend import prepare
from whisper.whisper import load_model
from whisper.whisper.audio import load_audio, log_mel_spectrogram,pad_or_trim,N_FRAMES, SAMPLE_RATE
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
print('Using device:', device)

# Load the tiny Whisper checkpoint through the locally cloned (and patched) whisper package.
tiny_model = load_model("tiny")

# Export only the encoder, traced with a random (1, 80, 3000) input, as ONNX opset 14.
dummy_input = torch.randn(1, 80, 3000).to(device)
torch.onnx.export(
    tiny_model.encoder,
    dummy_input,
    "./whisper_encoder.onnx",
    opset_version=14,
)

### Getting the Encoder ONNX Model

### IDEA
**The basic idea is:**
- The encoder runs as a DLC
- The decoder runs as a TFLite model
- Each token generated by the decoder is appended to the tokens from the previous decoder steps, and the extended sequence is then combined with the encoder result in the next decoder run

![modelArchitecture](image-assets/whisper_model_Architecture.png)



#### Normal Model Inference

In [None]:
# Runtime packages for the reference inference, the TFLite decoder and audio handling below.
# NOTE(review): torch==1.8.1 is installed here after torchvision==0.15.2 was pinned in the earlier
# install cell and after torch was already imported above — confirm these pins are compatible.
!pip3 install torch==1.8.1 
!pip3 install onnxruntime
!pip3 install tensorflow==2.10.1
!pip3 install tflite==2.3.0
!pip3 install soundfile
!pip3 install librosa
# NOTE(review): numpy was pinned to 1.22.2 in the earlier install cell; this re-pins it to 1.22 —
# confirm which version is intended.
!pip3 install numpy==1.22
#!pip3 install numpy==1.18.5

In [None]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset
import torch

# Reference pipeline: Hugging Face processor and model for whisper-tiny.
checkpoint = "openai/whisper-tiny"
processor = WhisperProcessor.from_pretrained(checkpoint)
model = WhisperForConditionalGeneration.from_pretrained(checkpoint)

# Take the first utterance of the dummy LibriSpeech validation split.
ds = load_dataset(
    "hf-internal-testing/librispeech_asr_dummy",
    "clean",
    split="validation",
)
sample = ds[0]["audio"]
input_features = processor(
    sample["array"],
    sampling_rate=sample["sampling_rate"],
    return_tensors="pt",
)

# Generate token ids, seeding the decoder with the four prompt ids.
prompt_ids = torch.tensor([[50258, 50259, 50359, 50363]])
predicted_ids = model.generate(**input_features, decoder_input_ids=prompt_ids)

# Turn the generated ids back into text, dropping special tokens.
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
print(transcription)

#### Getting The Special Token Values

In [None]:
# Whisper special tokens in id order; the first entry has id 50257.
special_tokens = [
    '<|endoftext|>', '<|startoftranscript|>',
    '<|en|>', '<|zh|>', '<|de|>', '<|es|>', '<|ru|>', '<|ko|>', '<|fr|>', '<|ja|>', '<|pt|>', '<|tr|>',
    '<|pl|>', '<|ca|>', '<|nl|>', '<|ar|>', '<|sv|>', '<|it|>', '<|id|>', '<|hi|>', '<|fi|>', '<|vi|>',
    '<|he|>', '<|uk|>', '<|el|>', '<|ms|>', '<|cs|>', '<|ro|>', '<|da|>', '<|hu|>', '<|ta|>', '<|no|>',
    '<|th|>', '<|ur|>', '<|hr|>', '<|bg|>', '<|lt|>', '<|la|>', '<|mi|>', '<|ml|>', '<|cy|>', '<|sk|>',
    '<|te|>', '<|fa|>', '<|lv|>', '<|bn|>', '<|sr|>', '<|az|>', '<|sl|>', '<|kn|>', '<|et|>', '<|mk|>',
    '<|br|>', '<|eu|>', '<|is|>', '<|hy|>', '<|ne|>', '<|mn|>', '<|bs|>', '<|kk|>', '<|sq|>', '<|sw|>',
    '<|gl|>', '<|mr|>', '<|pa|>', '<|si|>', '<|km|>', '<|sn|>', '<|yo|>', '<|so|>', '<|af|>', '<|oc|>',
    '<|ka|>', '<|be|>', '<|tg|>', '<|sd|>', '<|gu|>', '<|am|>', '<|yi|>', '<|lo|>', '<|uz|>', '<|fo|>',
    '<|ht|>', '<|ps|>', '<|tk|>', '<|nn|>', '<|mt|>', '<|sa|>', '<|lb|>', '<|my|>', '<|bo|>', '<|tl|>',
    '<|mg|>', '<|as|>', '<|tt|>', '<|haw|>', '<|ln|>', '<|ha|>', '<|ba|>', '<|jw|>', '<|su|>',
    '<|translate|>', '<|transcribe|>', '<|startoflm|>', '<|startofprev|>', '<|nocaptions|>', '<|notimestamps|>',
]

# Id of the first special token; later entries follow consecutively.
SPECIAL_TOKEN_BASE_ID = 50257

# Map token id -> token text. The map is intentionally NOT called `dict`: that name
# would shadow the builtin for every later cell run in the same kernel.
special_token_by_id = {
    token_id: token
    for token_id, token in enumerate(special_tokens, start=SPECIAL_TOKEN_BASE_ID)
}
print(special_token_by_id)

#### Getting Different Token ids for Different Tasks
- Right now, the token ids for English-language transcription are used

In [None]:
import numpy as np
from transformers import AutoConfig, AutoProcessor


# Checkpoint whose config and processor supply the prompt token ids.
model = "openai/whisper-tiny"
config = AutoConfig.from_pretrained(model)
processor = AutoProcessor.from_pretrained(model)

# Prompt for English transcription, returned as (position, token_id) pairs,
# e.g. [(1, 50259), (2, 50359), (3, 50363)].
forced_decoder_ids = processor.get_decoder_prompt_ids(language="en", task="transcribe")
print(forced_decoder_ids)

# Keep only the token ids and put the start token (50258) in front:
# [50258, 50259, 50359, 50363].
prompt_token_ids = [pair[1] for pair in forced_decoder_ids]
forced_decoder_ids = [config.decoder_start_token_id] + prompt_token_ids
print(forced_decoder_ids)

# To let Whisper predict the language and task itself, use only the start token:
# forced_decoder_ids = [config.decoder_start_token_id]
# [50258]

# Decoder input ids as a batch of one, stored as int32.
decoder_input_ids = np.array([forced_decoder_ids], dtype=np.int32)
print(decoder_input_ids)

## Getting the tflite Model

### data preprocessing

In [None]:
from datasets import load_dataset
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import os
import numpy as np

import torch

# loading the processor
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
!rm -rf input_features
os.makedirs('input_features',exist_ok=True)

# load dummy dataset and read audio files
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
print(ds)
for i in range(25):
    sample = ds[i]["audio"]
    input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="np") 
    
    inp_val=input_features.input_features.astype(np.float32)
    #Need to transpose the input
    print(inp_val.shape)
    updated_inp_val=inp_val.transpose(0,2,1)
    print(updated_inp_val.shape)
    with open("input_features/inp_val_"+str(i)+".raw", 'wb') as f:
        updated_inp_val.tofile(f)


#Creating list.txt
with open("list.txt",'w') as f:
    for i in range(25):
        f.write("x.1:=input_features/inp_val_"+str(i)+".raw\n")

##### Running the Decoder Model
- Take the last_hidden_state of the encoder model
- Then start from the initial decoder_input_ids and append the predicted tokens one by one


##### Tflite Decoder Block

In [None]:
import numpy as np
import torch
import tensorflow as tf
from transformers import (
        AutoTokenizer
    )
# Tokenizer used by decoder_block_tflite to turn generated token ids back into text.
model_id = "openai/whisper-tiny"
tokenizer = AutoTokenizer.from_pretrained(model_id)


# Decoder model taken from the cloned usefulsensors/openai-whisper repository.
tflite_model_path='openai-whisper/models/whisper-decoder-tiny.tflite'
#tflite_model_path='/content/whisper-decoder_main-int8.tflite'
print(tflite_model_path)

# Load the TFLite model and allocate tensors
# (decoder_block_tflite reads this module-level `interpreter`).
interpreter = tf.lite.Interpreter(model_path=tflite_model_path)
interpreter.allocate_tensors()



def decoder_block_tflite(encoder_hidden_states, max_tokens=448):
    """Greedily decode a transcription from encoder output with the TFLite decoder.

    Relies on the module-level ``interpreter`` (TFLite decoder) and ``tokenizer``.

    Args:
        encoder_hidden_states: Encoder output written to the decoder's first input.
            Callers in this notebook pass a float32 array reshaped to (1, 1500, 384).
        max_tokens: Upper bound on the total sequence length, prompt included.
            Decoding stops at this length even if no end-of-text token was
            produced, so a degenerate encoder output cannot loop forever.
            448 is assumed to match Whisper's maximum decoder length — TODO confirm
            for this TFLite model.

    Returns:
        The decoded text with special tokens removed.
    """
    input_details = interpreter.get_input_details()
    states_index = input_details[0]['index']
    ids_index = input_details[1]['index']
    # Use the dtypes the model declares instead of hard-coding them;
    # set_tensor rejects arrays whose dtype does not match the tensor.
    states_dtype = input_details[0]['dtype']
    ids_dtype = input_details[1]['dtype']
    logits_index = interpreter.get_output_details()[0]['index']

    encoder_hidden_states = np.asarray(encoder_hidden_states, dtype=states_dtype)

    # Prompt: <|startoftranscript|>, <|en|>, <|transcribe|>, <|notimestamps|>
    tokens = [50258, 50259, 50359, 50363]
    end_of_text = 50257  # <|endoftext|>

    while len(tokens) < max_tokens:
        decoder_input_ids = np.asarray([tokens], dtype=ids_dtype)

        # The id sequence grows by one every step, so the input must be resized
        # and the tensors re-allocated before each run.
        interpreter.resize_tensor_input(ids_index, decoder_input_ids.shape)
        interpreter.allocate_tensors()
        # Re-allocation may re-plan tensor buffers, so write BOTH inputs again
        # rather than assuming the encoder states survived it.
        interpreter.set_tensor(states_index, encoder_hidden_states)
        interpreter.set_tensor(ids_index, decoder_input_ids)

        interpreter.invoke()
        logits = interpreter.get_tensor(logits_index)

        # Greedy choice: most likely token at the last sequence position.
        last_token = int(np.argmax(logits, axis=-1)[0, -1])
        tokens.append(last_token)
        if last_token == end_of_text:
            break

    return tokenizer.batch_decode(np.expand_dims(tokens, axis=0), skip_special_tokens=True)[0]
    



#### Getting the encoder DLC Model
- The ONNX encoder model exported above runs fine
- Now convert it to DLC and check how it works

In [None]:
import os

# Root of the local SNPE SDK; the %%bash cells below source $SNPE_ROOT/bin/envsetup.sh.
# NOTE(review): machine-specific absolute path — adjust to your installation.
os.environ['SNPE_ROOT']="/local/mnt/workspace/snpe/2.29.0.241129"

#### Getting the Fp32 Model

In [None]:
# protobuf is pinned before the SNPE conversion cells run.
# NOTE(review): the reason for this exact version is not stated in the notebook — confirm.
!pip3 install protobuf==3.20.2

In [None]:
%%bash
# Set up the SNPE environment for this shell.
source $SNPE_ROOT/bin/envsetup.sh

# Convert the exported encoder to an FP32 DLC; the input "x.1" is declared as 1,80,3000.
snpe-onnx-to-dlc -i whisper_encoder.onnx -d x.1 1,80,3000 -o whisper_tiny_encoder_fp32.dlc
# Write a text summary and an HTML view of the converted model for inspection.
snpe-dlc-info -i whisper_tiny_encoder_fp32.dlc > whisper_tiny_encoder_fp32.txt
snpe-dlc-viewer -i whisper_tiny_encoder_fp32.dlc -s whisper_tiny_encoder_fp32.html 

#### Creating w8a16 model
* Choose --htp_socs based on the end device where the model will be deployed, for example sm8750, sm8650 or sm8550
* --optimizations cle --axis_quant

In [None]:
%%bash
# Set up the SNPE environment for this shell.
source $SNPE_ROOT/bin/envsetup.sh

# Target SoC for the HTP graph. Defaults to sm8750; to deploy elsewhere set it before
# running this cell, e.g. os.environ['HTP_SOC'] = "sm8650" (or "sm8550") in a Python cell.
HTP_SOC="${HTP_SOC:-sm8750}"

# Quantize the FP32 DLC to 8-bit weights / 16-bit activations using the inputs in list.txt.
snpe-dlc-quantize --input_dlc whisper_tiny_encoder_fp32.dlc --input_list list.txt  --output_dlc whisper_tiny_encoder_w8a16.dlc --weights_bitwidth 8 --act_bitwidth 16 --enable_htp --htp_socs "$HTP_SOC"

# Write a text summary and an HTML view of the quantized model for inspection.
snpe-dlc-info -i whisper_tiny_encoder_w8a16.dlc > whisper_tiny_encoder_w8a16.txt
snpe-dlc-viewer -i whisper_tiny_encoder_w8a16.dlc -s whisper_tiny_encoder_w8a16.html

### Inferencing the FP32 Model on linux x86 machine

In [None]:
%%bash
# Remove results of any previous host run so stale outputs are not picked up below.
rm -rf OUTPUT_Encoder

In [None]:
%%bash
# Set up the SNPE environment for this shell.
source $SNPE_ROOT/bin/envsetup.sh

# Run the FP32 encoder DLC on every input listed in list.txt; results go to OUTPUT_Encoder/.
snpe-net-run --container whisper_tiny_encoder_fp32.dlc --input_list list.txt --output_dir OUTPUT_Encoder

#### Checking the output

In [None]:
import glob
import tensorflow as tf
import os

# Output directory written by the host-side snpe-net-run cell above.
folder = ["OUTPUT_Encoder"]

for output_dir in folder:
    # glob returns entries in arbitrary order. Sorting by (length, name) places
    # a name ending in "_2" before one ending in "_10", so transcriptions are
    # printed in a stable order — presumably the order of list.txt; confirm
    # against the result directory names.
    result_paths = sorted(
        glob.glob(os.path.join(output_dir, '*')),
        key=lambda path: (len(path), path),
    )
    for result_path in result_paths:
        # Only sub-directories hold dumped tensors; skip log files and any other stray files.
        if not os.path.isdir(result_path):
            continue
        # '599.raw' is read as a flat float32 buffer and reshaped to the
        # (1, 1500, 384) layout passed to the TFLite decoder.
        raw_file = os.path.join(result_path, '599.raw')
        last_hidden_state = np.fromfile(raw_file, dtype="float32")

        encoder_hidden_states = last_hidden_state.reshape((1, 1500, 384))
        print(decoder_block_tflite(encoder_hidden_states))

            

## Inferencing on Device

In [None]:
%%bash
# List attached devices; the serial shown here is what DEVICE_ID below must be set to.
adb devices

In [None]:
import os
# Settings exported through the environment so the %%bash cells below can read them.
os.environ['SNPE_ROOT']="/local/mnt/workspace/snpe/2.29.0.241129" #set up your snpe path here.
os.environ['RAW_FILE_FOLDER']="input_features"
os.environ['FOLDER_WITH_ARTIFACTS']="whisper"
os.environ['DLCFP32']="whisper_tiny_encoder_fp32.dlc"
# NOTE(review): the variable name says A8W16 but the file is the w8a16 model — the name looks
# swapped; confirm before renaming, since the push cell reads $DLCA8W16.
os.environ['DLCA8W16']="whisper_tiny_encoder_w8a16.dlc"
# NOTE(review): no cell in this notebook produces the w8a8 / w16a16 DLCs named below.
os.environ['DLCA8W8']="whisper_tiny_encoder_w8a8.dlc"
os.environ['DLCA16W16']="whisper_tiny_encoder_w16a16.dlc"
os.environ['TARGET_INPUT_LIST']="list.txt"
# Working folder on the device, created under /data/local/tmp/.
os.environ['ONDEVICE_FOLDER']="whisper"
os.environ['DEVICE_HOST']="localhost"
os.environ['DEVICE_ID']="58671ff7" #fill your device-id. Use command "adb devices" to get devices names. example :"e18d5d0"
os.environ['SNPE_TARGET_ARCH']="aarch64-android"
os.environ['SNPE_TARGET_STL']="libc++_shared.so"

In [None]:
%%bash
# adb invocation bound to the configured host and device serial.
export DEVICE_SHELL="adb -H $DEVICE_HOST -s $DEVICE_ID"
# Create the on-device bin/, lib/ and dsp/lib folders that the SNPE runtime is pushed into.
$DEVICE_SHELL shell "mkdir -p /data/local/tmp/snpeexample/$SNPE_TARGET_ARCH/bin" && $DEVICE_SHELL shell "mkdir -p /data/local/tmp/snpeexample/$SNPE_TARGET_ARCH/lib" && $DEVICE_SHELL shell "mkdir -p /data/local/tmp/snpeexample/dsp/lib"

* In the code block below, use the Hexagon version that matches your SoC:
    - v79 for sm8750
    - v75 for sm8650
    - v73 for sm8550

In [None]:
%%bash
# adb invocation bound to the configured host and device serial.
export DEVICE_SHELL="adb -H $DEVICE_HOST -s $DEVICE_ID"

# Hexagon DSP version of the target SoC: v79 for sm8750, v75 for sm8650, v73 for sm8550.
# Defaults to v79; to change it set os.environ['HEXAGON_ARCH'] = "v75" (or "v73")
# in a Python cell before running this one.
HEXAGON_ARCH="${HEXAGON_ARCH:-v79}"

# C++ runtime, snpe-net-run binary, DSP skel libraries and SNPE shared libraries.
$DEVICE_SHELL push $SNPE_ROOT/lib/$SNPE_TARGET_ARCH/$SNPE_TARGET_STL /data/local/tmp/snpeexample/$SNPE_TARGET_ARCH/lib
$DEVICE_SHELL push $SNPE_ROOT/bin/$SNPE_TARGET_ARCH/snpe-net-run /data/local/tmp/snpeexample/$SNPE_TARGET_ARCH/bin
$DEVICE_SHELL push $SNPE_ROOT/lib/hexagon-$HEXAGON_ARCH/unsigned/*.so /data/local/tmp/snpeexample/dsp/lib
$DEVICE_SHELL push $SNPE_ROOT/lib/$SNPE_TARGET_ARCH/*.so /data/local/tmp/snpeexample/$SNPE_TARGET_ARCH/lib

In [None]:
%%bash
# adb invocation bound to the configured host and device serial.
export DEVICE_SHELL="adb -H $DEVICE_HOST -s $DEVICE_ID"
# Create the on-device working folder for models, inputs and outputs.
$DEVICE_SHELL shell "mkdir -p /data/local/tmp/$ONDEVICE_FOLDER"

In [None]:
%%bash
# adb invocation bound to the configured host and device serial.
export DEVICE_SHELL="adb -H $DEVICE_HOST -s $DEVICE_ID"
# Push both DLCs, the raw input folder and the input list into the on-device working folder.
$DEVICE_SHELL push $DLCA8W16 /data/local/tmp/$ONDEVICE_FOLDER
$DEVICE_SHELL push $DLCFP32 /data/local/tmp/$ONDEVICE_FOLDER
$DEVICE_SHELL push input_features /data/local/tmp/$ONDEVICE_FOLDER
$DEVICE_SHELL push $TARGET_INPUT_LIST /data/local/tmp/$ONDEVICE_FOLDER

In [None]:
%%bash
# adb invocation bound to the configured host and device serial.
export DEVICE_SHELL="adb -H $DEVICE_HOST -s $DEVICE_ID"
# NOTE(review): `adb shell` is started without a command; the lines after it are presumably read
# by the device shell from this cell's stdin and therefore execute ON THE DEVICE — confirm.
# They open up permissions on the pushed SNPE files, extend LD_LIBRARY_PATH and PATH, and run the
# FP32 DLC with snpe-net-run, writing to OUTPUT_32b_CPU inside /data/local/tmp/whisper.
$DEVICE_SHELL shell
chmod -R 777 /data/local/tmp/snpeexample
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/snpeexample/aarch64-android/lib
export PATH=$PATH:/data/local/tmp/snpeexample/aarch64-android/bin
export OUTPUT_FOLDER=OUTPUT_32b_CPU
export OUTPUT_DLC_32=whisper_tiny_encoder_fp32.dlc
export ONDEVICE_FOLDER="whisper"
cd /data/local/tmp/$ONDEVICE_FOLDER &&
snpe-net-run --container $OUTPUT_DLC_32 --input_list list.txt   --output_dir $OUTPUT_FOLDER

In [None]:
%%bash
# adb invocation bound to the configured host and device serial.
export DEVICE_SHELL="adb -H $DEVICE_HOST -s $DEVICE_ID"
# NOTE(review): `adb shell` is started without a command; the lines after it are presumably read
# by the device shell from this cell's stdin and therefore execute ON THE DEVICE — confirm.
# They set the library paths (including ADSP_LIBRARY_PATH for the pushed DSP libraries) and run
# the w8a16 DLC with --use_dsp and --enable_cpu_fallback, writing to OUTPUT_DSP_W8A16
# inside /data/local/tmp/whisper.
$DEVICE_SHELL shell
chmod -R 777 /data/local/tmp/snpeexample
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/snpeexample/aarch64-android/lib
export PATH=$PATH:/data/local/tmp/snpeexample/aarch64-android/bin
export ADSP_LIBRARY_PATH="/data/local/tmp/snpeexample/dsp/lib;/system/lib/rfsa/adsp;/system/vendor/lib/rfsa/adsp;/dsp"
export OUTPUT_FOLDER=OUTPUT_DSP_W8A16
export DLC_W8A16=whisper_tiny_encoder_w8a16.dlc
export ONDEVICE_FOLDER="whisper"
cd /data/local/tmp/$ONDEVICE_FOLDER &&
snpe-net-run --container $DLC_W8A16 --input_list list.txt  --output_dir $OUTPUT_FOLDER --use_dsp --enable_cpu_fallback

In [None]:
%%bash
# Remove previously pulled result folders on the host before pulling fresh ones.
rm -rf OUTPUT_32b_CPU/
rm -rf OUTPUT_DSP_W8A16/

In [None]:
%%bash
# adb invocation bound to the configured host and device serial.
export DEVICE_SHELL="adb -H $DEVICE_HOST -s $DEVICE_ID"
# Copy both on-device result folders back into the notebook's working directory.
$DEVICE_SHELL pull /data/local/tmp/$ONDEVICE_FOLDER/OUTPUT_DSP_W8A16 OUTPUT_DSP_W8A16
$DEVICE_SHELL pull /data/local/tmp/$ONDEVICE_FOLDER/OUTPUT_32b_CPU OUTPUT_32b_CPU

### Checking the Output

In [None]:
import glob
import tensorflow as tf
import os
import numpy as np

# Result folders pulled from the device: FP32 CPU run and w8a16 DSP run.
folder = ["OUTPUT_32b_CPU","OUTPUT_DSP_W8A16"]

for output_dir in folder:
    print("------------------------------"+output_dir+"------------------------------")
    # glob returns entries in arbitrary order. Sorting by (length, name) places
    # a name ending in "_2" before one ending in "_10", so both folders print
    # their transcriptions in the same stable order and can be compared line
    # by line.
    result_paths = sorted(
        glob.glob(os.path.join(output_dir, '*')),
        key=lambda path: (len(path), path),
    )
    for result_path in result_paths:
        # Only sub-directories hold dumped tensors; skip log files and any other stray files.
        if not os.path.isdir(result_path):
            continue
        # '599.raw' is read as a flat float32 buffer and reshaped to the
        # (1, 1500, 384) layout passed to the TFLite decoder.
        raw_file = os.path.join(result_path, '599.raw')
        last_hidden_state = np.fromfile(raw_file, dtype="float32")

        encoder_hidden_states = last_hidden_state.reshape((1, 1500, 384))
        print(decoder_block_tflite(encoder_hidden_states))

            