Whisper models:
* [OpenAI Whisper - installation requirements](https://github.com/openai/whisper)
* [Hugging Face Transformers - Whisper](https://huggingface.co/openai/whisper-large-v3/blob/main/README.md)

In [None]:
# Print the current working directory; the relative paths used below
# (e.g. ./grip_audio_file.mp3) are resolved against it.
!pwd

In [None]:
#aa added
!pip install --upgrade pip
!pip install --upgrade transformers datasets[audio] accelerate

In [None]:
# Imports
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration, pipeline
from IPython.display import Audio

In [None]:
# Configuration shared by the cells below.
# aa: changed audio file path
pipeline_id: str = "automatic-speech-recognition"
model_id: str = "openai/whisper-large-v3"
audio_file: str = r"./grip_audio_file.mp3"

# Use the GPU with half precision when CUDA is available, otherwise CPU with float32.
device: str
if torch.cuda.is_available():
    device, torch_dtype = "cuda", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

#### Instantiate model and speech processing pipeline
***

In [None]:
# Load the checkpoint named by `model_id` with the dtype chosen in the config cell.
model = WhisperForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
# Move the model to `device`; as the last expression, its repr becomes the cell output.
model.to(device)


In [None]:
# The processor bundles Whisper's tokenizer and feature extractor.
processor = WhisperProcessor.from_pretrained(model_id)

# Build the speech-recognition pipeline around the model loaded above.
pipe = pipeline(
    task=pipeline_id,
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)

#### Run inference
***

In [None]:
# Play the audio: as the last expression of the cell, the Audio object for
# `audio_file` is rendered as the cell output.
Audio(audio_file)

In [None]:
!curl https://johnvansickle.com/ffmpeg/releases/ffmpeg-release-amd64-static.tar.xz -o ffmpeg.tar.xz \
     && tar -xf ffmpeg.tar.xz && rm ffmpeg.tar.xz

In [None]:
ffmdir = !find . -iname ffmpeg-*-static
path = %env PATH
path = path + ':' + ffmdir[0]
%env PATH $path
!which ffmpeg
print('Done!')

In [None]:
%%time 
# aa: added previous %%time
# Run transcription
generate_kwargs = {
    "language": "german",
    "task": "transcribe",
    "condition_on_prev_tokens": True,
    "temperature": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
    "logprob_threshold": -1.0,
    "no_speech_threshold": 0.6,
}

# run transcription 
try:
    #aa added return_timestamps=True
    result = pipe(audio_file, generate_kwargs=generate_kwargs, return_timestamps=True)

    # play audio
    display(Audio(audio_file))

    # show transcribed text
    text = result["text"]
    print(f"Transcribed text:\n\t{text}")
    
except Exception as e:
    print(f"An error occurred: {e}")

In [None]:
%%time   
#aa: added previous %%time
# Run transcription
generate_kwargs = {
    "language": "german",
    "task": "translate",
    "condition_on_prev_tokens": True,
    "temperature": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
    "logprob_threshold": -1.0,
    "no_speech_threshold": 0.6,
}

# run transcription 
try:
    #aa added return_timestamps=True
    result = pipe(audio_file, generate_kwargs=generate_kwargs, return_timestamps=True)

    # play audio
    display(Audio(audio_file))

    # show transcribed text
    text = result["text"]
    print(f"Translated text:\n\t{text}")
    
except Exception as e:
    print(f"An error occurred: {e}")

In [None]:
#aa: convert the model to onnx

In [1]:
!pip install optimum[exporters]

Collecting timm
  Downloading timm-1.0.13-py3-none-any.whl (2.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
Collecting transformers>=4.29
  Downloading transformers-4.46.3-py3-none-any.whl (10.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m113.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting tokenizers<0.21,>=0.20
  Downloading tokenizers-0.20.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m134.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, transformers, timm
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.21.0
    Uninstalling tokenizers-0.21.0:
      Successfully uninstalled tokenizers-0.21.0
  Attempting uninstall: transformers
    Found existing installation: transformer

In [5]:
# Show the options accepted by the ONNX export command.
!optimum-cli export onnx --help

usage: optimum-cli export onnx [-h] -m MODEL [--task TASK] [--opset OPSET]
                               [--device DEVICE] [--fp16]
                               [--dtype {fp32,fp16,bf16}]
                               [--optimize {O1,O2,O3,O4}] [--monolith]
                               [--no-post-process] [--variant VARIANT]
                               [--framework {pt,tf}] [--atol ATOL]
                               [--cache_dir CACHE_DIR] [--trust-remote-code]
                               [--pad_token_id PAD_TOKEN_ID]
                               [--library-name {transformers,diffusers,timm,sentence_transformers}]
                               [--model-kwargs MODEL_KWARGS] [--legacy]
                               [--no-dynamic-axes] [--no-constant-folding]
                               [--batch_size BATCH_SIZE]
                               [--sequence_length SEQUENCE_LENGTH]
                               [--num_choices NUM_CHOICES] [--width WIDTH]
                

In [3]:
# Create the models2 output directory (-p: no error if it already exists).
!mkdir -p models2

In [None]:
%%time
# Export openai/whisper-large-v3 to ONNX, writing the result to models/
# (the cell is timed by %%time).
!optimum-cli export onnx --model openai/whisper-large-v3 models/

In [6]:
# Export openai/whisper-small to ONNX, writing the result to models2.
# NOTE(review): the recorded output reports validation differences above the
# tolerance (max diff up to ~7.02 against atol 0.001) - check the exported
# model's outputs before relying on it.
!optimum-cli export onnx --model openai/whisper-small  models2

  if input_features.shape[-1] != expected_seq_length:
  if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim):
verbose: False, log level: Level.ERROR

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
  if sequence_length != 1:
verbose: False, log level: Level.ERROR

  or len(self.key_cache[layer_idx]) == 0  # the layer has no cache
  elif len(self.key_cache[layer_idx]) == 0:  # fills previously skipped layers; checking for tensor causes errors
verbose: False, log level: Level.ERROR

Weight deduplication check in the ONNX export requires accelerate. Please install accelerate to run it.
		-[x] values not close enough, max diff: 0.002777099609375 (atol: 0.001)
		-[x] values not close enough, max diff: 7.021236419677734 (atol: 0.001)
		-[x] values not close enough, max diff: 4.96610403060

In [7]:
#not needed pytorch image has already these packages
#!pip install boto3 botocore

In [10]:
import os
import boto3
import botocore

# Read the S3 connection settings from the environment.
aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')
endpoint_url = os.getenv('AWS_S3_ENDPOINT')
region_name = os.getenv('AWS_DEFAULT_REGION')
bucket_name = os.getenv('AWS_S3_BUCKET')

# Stop early when any of the settings is missing or empty.
if any(
    not setting
    for setting in (
        aws_access_key_id,
        aws_secret_access_key,
        endpoint_url,
        region_name,
        bucket_name,
    )
):
    raise ValueError("One or more connection variables are empty.  "
                     "Please check your connection to an S3 bucket.")

# Session that carries the credentials.
session = boto3.session.Session(
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

# S3 resource bound to the configured endpoint and region, using s3v4 signing.
s3_resource = session.resource(
    's3',
    config=botocore.client.Config(signature_version='s3v4'),
    endpoint_url=endpoint_url,
    region_name=region_name,
)

# Bucket handle used by the helper functions below.
bucket = s3_resource.Bucket(bucket_name)
        
        
def upload_directory_to_s3(local_directory, s3_prefix):
    """Upload every file below ``local_directory`` to the module-level ``bucket``.

    Each file is stored under ``s3_prefix`` followed by its path relative to
    ``local_directory``. One ``source -> key`` line is printed per file.

    Args:
        local_directory: Directory that is walked recursively.
        s3_prefix: Prefix prepended to each relative path to form the object key.

    Returns:
        The number of files uploaded (0 if the directory holds no files).
    """
    num_files = 0
    for root, _dirs, files in os.walk(local_directory):
        for filename in files:
            file_path = os.path.join(root, filename)
            relative_path = os.path.relpath(file_path, local_directory)
            # Object keys use "/" as separator; os.path.join would insert the
            # OS-specific separator (e.g. "\\" on Windows), so normalise it.
            s3_key = os.path.join(s3_prefix, relative_path).replace(os.sep, "/")
            print(f"{file_path} -> {s3_key}")
            bucket.upload_file(file_path, s3_key)
            num_files += 1
    return num_files
                                                        
                                                        
def list_objects(prefix):
    """Print the key of each object in ``bucket`` selected by ``Prefix=prefix``."""
    matching = bucket.objects.filter(Prefix=prefix)
    for s3_object in matching.all():
        print(s3_object.key)

In [11]:
# Show what the bucket currently holds under the "models" prefix.
list_objects("models")

In [12]:
# Directory holding the exported ONNX model files.
local_models_directory = "models"

# Fail early with a hint when the export has not produced the directory yet.
if not os.path.isdir(local_models_directory):
    raise ValueError(f"The directory '{local_models_directory}' does not exist.  "
                     "Did you run the ONNX export cell first?")

# Upload from the directory that was just checked, under the "models" prefix.
num_files = upload_directory_to_s3(local_models_directory, "models")

# An existing but empty directory means the export did not write anything.
if num_files == 0:
    raise ValueError("No files uploaded.  Did the ONNX export finish and "
                     f"write its files to the '{local_models_directory}' directory?  "
                     "Check for \"models/encoder_model.onnx\"")

models/config.json -> models/config.json
models/generation_config.json -> models/generation_config.json
models/tokenizer_config.json -> models/tokenizer_config.json
models/special_tokens_map.json -> models/special_tokens_map.json
models/added_tokens.json -> models/added_tokens.json
models/vocab.json -> models/vocab.json
models/merges.txt -> models/merges.txt
models/normalizer.json -> models/normalizer.json
models/tokenizer.json -> models/tokenizer.json
models/preprocessor_config.json -> models/preprocessor_config.json
models/encoder_model.onnx -> models/encoder_model.onnx
models/decoder_model.onnx -> models/decoder_model.onnx
models/decoder_with_past_model.onnx -> models/decoder_with_past_model.onnx
models/decoder_model_merged.onnx -> models/decoder_model_merged.onnx


In [13]:
list_objects("models")

models/added_tokens.json
models/config.json
models/decoder_model.onnx
models/decoder_model_merged.onnx
models/decoder_with_past_model.onnx
models/encoder_model.onnx
models/generation_config.json
models/merges.txt
models/normalizer.json
models/preprocessor_config.json
models/special_tokens_map.json
models/tokenizer.json
models/tokenizer_config.json
models/vocab.json


In [None]:
# Request the /v2/models listing from the server (presumably the model-serving
# endpoint - confirm); -s suppresses curl's progress output.
!curl -s https://cc-cc.apps.ocp4.example.com/v2/models

In [None]:
# Print the current working directory.
!pwd

In [None]:
from transformers import WhisperForConditionalGeneration

hf_model_id = "openai/whisper-medium" # Change to your model url
hf_model = WhisperForConditionalGeneration.from_pretrained(hf_model_id)

# Save the model that was just loaded. The previous code passed `merged_model`,
# a name this notebook never defines, so the cell failed with NameError.
torch.save(hf_model, "./pretrained_model_path")

In [1]:
!pip install optimum transformers onnxruntime


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
from optimum.onnxruntime import ORTModelForSpeechSeq2Seq

# Checkpoint to convert.
model_id = "openai/whisper-tiny"

# Load the checkpoint named by `model_id` with export=True. The previous code
# passed the placeholder string 'onnx_variant' and never used `model_id`.
model = ORTModelForSpeechSeq2Seq.from_pretrained(model_id, export=True)

# Write the resulting model files to ./onnx_model.
model.save_pretrained('./onnx_model')

In [None]:
from transformers import WhisperForConditionalGeneration

# Load the whisper-medium checkpoint; note that this rebinds `model`.
model_name = "openai/whisper-medium"
model = WhisperForConditionalGeneration.from_pretrained(
    pretrained_model_name_or_path=model_name
)

In [None]:
# Write whatever `model` currently refers to into ./onnx_model.
# NOTE(review): if the cell above ran last, `model` is the PyTorch
# WhisperForConditionalGeneration, so the files are presumably written in the
# Transformers format rather than ONNX despite the directory name - confirm
# that this is intended.
model.save_pretrained('./onnx_model')

In [3]:
import os
import boto3
import botocore

# Read the S3 connection settings from the environment.
aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')
endpoint_url = os.getenv('AWS_S3_ENDPOINT')
region_name = os.getenv('AWS_DEFAULT_REGION')
bucket_name = os.getenv('AWS_S3_BUCKET')

# Stop early when any of the settings is missing or empty.
if any(
    not setting
    for setting in (
        aws_access_key_id,
        aws_secret_access_key,
        endpoint_url,
        region_name,
        bucket_name,
    )
):
    raise ValueError("One or more connection variables are empty.  "
                     "Please check your connection to an S3 bucket.")

# Session that carries the credentials.
session = boto3.session.Session(
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

# S3 resource bound to the configured endpoint and region, using s3v4 signing.
s3_resource = session.resource(
    's3',
    config=botocore.client.Config(signature_version='s3v4'),
    endpoint_url=endpoint_url,
    region_name=region_name,
)

# Bucket handle used by the helper functions below.
bucket = s3_resource.Bucket(bucket_name)
        
        
def upload_directory_to_s3(local_directory, s3_prefix):
    """Upload every file below ``local_directory`` to the module-level ``bucket``.

    Each file is stored under ``s3_prefix`` followed by its path relative to
    ``local_directory``. One ``source -> key`` line is printed per file.

    Args:
        local_directory: Directory that is walked recursively.
        s3_prefix: Prefix prepended to each relative path to form the object key.

    Returns:
        The number of files uploaded (0 if the directory holds no files).
    """
    num_files = 0
    for root, _dirs, files in os.walk(local_directory):
        for filename in files:
            file_path = os.path.join(root, filename)
            relative_path = os.path.relpath(file_path, local_directory)
            # Object keys use "/" as separator; os.path.join would insert the
            # OS-specific separator (e.g. "\\" on Windows), so normalise it.
            s3_key = os.path.join(s3_prefix, relative_path).replace(os.sep, "/")
            print(f"{file_path} -> {s3_key}")
            bucket.upload_file(file_path, s3_key)
            num_files += 1
    return num_files
                                                        
                                                        
def list_objects(prefix):
    """Print the key of each object in ``bucket`` selected by ``Prefix=prefix``."""
    matching = bucket.objects.filter(Prefix=prefix)
    for s3_object in matching.all():
        print(s3_object.key)

In [4]:
# Show what the bucket currently holds under the "models" prefix.
list_objects("models")

In [5]:
# Directory holding the exported ONNX model files.
local_models_directory = "models"

# Fail early with a hint when the export has not produced the directory yet.
if not os.path.isdir(local_models_directory):
    raise ValueError(f"The directory '{local_models_directory}' does not exist.  "
                     "Did you run the ONNX export cell first?")

# Upload from the directory that was just checked, under the "models" prefix.
num_files = upload_directory_to_s3(local_models_directory, "models")

# An existing but empty directory means the export did not write anything.
if num_files == 0:
    raise ValueError("No files uploaded.  Did the ONNX export finish and "
                     f"write its files to the '{local_models_directory}' directory?  "
                     "Check for \"models/encoder_model.onnx\"")

models/config.json -> models/config.json
models/generation_config.json -> models/generation_config.json
models/tokenizer_config.json -> models/tokenizer_config.json
models/special_tokens_map.json -> models/special_tokens_map.json
models/added_tokens.json -> models/added_tokens.json
models/vocab.json -> models/vocab.json
models/merges.txt -> models/merges.txt
models/normalizer.json -> models/normalizer.json
models/tokenizer.json -> models/tokenizer.json
models/preprocessor_config.json -> models/preprocessor_config.json
models/encoder_model.onnx -> models/encoder_model.onnx
models/decoder_model.onnx -> models/decoder_model.onnx


In [1]:
from transformers import WhisperConfig
from optimum.exporters.onnx import main_export
from optimum.exporters.onnx.model_configs import WhisperOnnxConfig

# Checkpoint to convert.
model_id = "openai/whisper-small"

print("Exporting model as ONNX")

# Build the base ONNX config from the checkpoint's configuration, then derive
# one config each for the encoder and the decoder.
config = WhisperConfig.from_pretrained(model_id)
onnx_config = WhisperOnnxConfig(config, task="automatic-speech-recognition")
encoder_config = onnx_config.with_behavior("encoder")
decoder_config = onnx_config.with_behavior("decoder")

# Map the sub-model names to the config used for each.
custom_onnx_configs = dict(
    encoder_model=encoder_config,
    decoder_model=decoder_config,
)

# Run the export with output directory "models".
main_export(
    model_name_or_path=model_id,
    output="models",
    task="automatic-speech-recognition",
    custom_onnx_configs=custom_onnx_configs,
)




Exporting model as ONNX


config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/967M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.87k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

  if input_features.shape[-1] != expected_seq_length:
  if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim):


verbose: False, log level: Level.ERROR



  if sequence_length != 1:


verbose: False, log level: Level.ERROR



Weight deduplication check in the ONNX export requires accelerate. Please install accelerate to run it.
		-[x] values not close enough, max diff: 0.01586604118347168 (atol: 0.001)
- last_hidden_state: max diff = 0.01586604118347168.
 The exported model was saved at: models
