# OpenAI Whisper large-v3 on SageMaker

***
https://huggingface.co/openai/whisper-large-v3

https://github.com/SYSTRAN/faster-whisper
***

# Setup: install ffmpeg on the notebook instance / client

In [None]:
# ***** Install ffmpeg: run these commands in a terminal, not in a notebook cell *****

# Install ffmpeg from conda-forge into the active conda environment
conda install -c conda-forge ffmpeg
# Print the installed version to confirm the install worked.
# (`ffmpeg -v` is the log-level option and fails without an argument.)
ffmpeg -version
which ffmpeg
# Symlink the JupyterSystemEnv binaries into /usr/bin so they are found on the system PATH
sudo ln -s /home/ec2-user/anaconda3/envs/JupyterSystemEnv/bin/ffmpeg /usr/bin/ffmpeg
sudo ln -s /home/ec2-user/anaconda3/envs/JupyterSystemEnv/bin/ffprobe /usr/bin/ffprobe


# installing libraries

In [1]:
!pip install -qU sagemaker boto3 openai-whisper pydub

In [2]:
# Confirm the ffmpeg binary is on PATH (expected: /usr/bin/ffmpeg, the symlink
# created during the terminal setup step)
!which ffmpeg

/usr/bin/ffmpeg


# Deploying on SageMaker endpoint

In [3]:
import sagemaker
import boto3
import json
from sagemaker.huggingface import HuggingFaceModel

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [4]:


# Prefer the execution role reported by the SageMaker SDK; if that raises
# ValueError, fall back to looking up the role named
# 'sagemaker_execution_role' through IAM and use its ARN.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role_description = iam.get_role(RoleName='sagemaker_execution_role')
    role = role_description['Role']['Arn']

# Hub model configuration (https://huggingface.co/models): the model to load
# and the task it is served under, passed to the container as environment
# variables.
hub = dict(
    HF_MODEL_ID='openai/whisper-large-v3',
    HF_TASK='automatic-speech-recognition',
)

# Describe the Hugging Face model: framework versions, environment and role.
huggingface_model = HuggingFaceModel(
    role=role,
    env=hub,
    transformers_version='4.37.0',
    pytorch_version='2.1.0',
    py_version='py310',
)

# Deploy the model to a SageMaker Inference endpoint.
deploy_settings = {
    'initial_instance_count': 1,          # number of instances
    'instance_type': 'ml.g4dn.xlarge',    # EC2 instance type
}
predictor = huggingface_model.deploy(**deploy_settings)



-----------!

In [12]:
from sagemaker.serializers import DataSerializer
from pydub import AudioSegment

# Send the request body as raw audio bytes.
predictor.serializer = DataSerializer(content_type='audio/x-audio')

AUDIO_FILE = "2.1.mp3"
# Length of each slice in milliseconds (pydub lengths and slice indices are in ms).
# NOTE(review): 25 s is presumably chosen to stay below Whisper's 30 s window — confirm.
SEGMENT_MS = 25000

sound = AudioSegment.from_file(AUDIO_FILE)
length = len(sound)
# Ceiling division. The previous `length // SEGMENT_MS + 1` produced an extra,
# empty trailing segment whenever the length was an exact multiple of
# SEGMENT_MS (and one empty segment for zero-length audio).
segment_count = -(-length // SEGMENT_MS)

print(f"{AUDIO_FILE} 's length is {len(sound)}, spilted into {segment_count} segments~")

# Each slice is written to this scratch file and read back as bytes.
filename = "temp.mp3"
for i in range(segment_count):
    start = i * SEGMENT_MS
    # The last slice stops at the end of the audio instead of a full SEGMENT_MS.
    end = min(start + SEGMENT_MS, length)
    print([start, end])
    temp = sound[start:end]
    # export() returns the file object it opened; close it so the handle is
    # not leaked on every iteration.
    temp.export(filename, format="mp3").close()
    with open(filename, "rb") as f:
        data = f.read()
    print(predictor.predict(data))



2.1.mp3 's length is 54720, spilted into 3 segments~
[0, 25000]
{'text': " Thank you for calling technology technical support. How can I help you? Hi, I just have a quick question on a application update that I just did. After I push the task like this, it's an A80. After I push the task, you know, it's pushing to the terminal. And if I refresh and go back in, as long as it says it's succeeded, then I know there's nothing."}
[25000, 50000]
{'text': " else that needs to be done, correct? Yes. Okay, so there's nothing on the terminal. You know, if it went through correct at the terminal, the change should be made 100% in the terminal, correct? Yes, that's correct, sir. Okay, all right. I just want to make sure. I thought so. I just want to make sure. Okay, very good. Thank you so much. Okay, is there anything else I can update? No, no, that was it. Okay."}
[50000, 54720]
{'text': ' Okay, thank you for calling, Pat. Have a nice day. You too. Bye-bye.'}


In [13]:
# Keep the endpoint's name so a Predictor can be attached to the same endpoint
# again without redeploying the model.
endpoint_name = predictor.endpoint_name
print(endpoint_name)

huggingface-pytorch-inference-2024-04-18-05-41-41-954


# Re-invoking an already-created endpoint (optional)

In [20]:
# Only needed to re-invoke an already-created endpoint
import sagemaker
from sagemaker import Model, image_uris, serializers, deserializers
from sagemaker.predictor import Predictor
from sagemaker.serializers import DataSerializer
from pydub import AudioSegment

sagemaker_session = sagemaker.Session()
# No-op when an earlier cell already set endpoint_name. In a fresh kernel,
# replace the right-hand side with the endpoint's name as a string,
# e.g. "pytorch-inference-2024-04-14-16-40-53-471".
endpoint_name = endpoint_name
# Attach to the existing endpoint: raw audio bytes in, JSON out.
predictor = Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sagemaker_session,
    serializer=serializers.DataSerializer(content_type='audio/x-audio'),
    deserializer=deserializers.JSONDeserializer(),
)

AUDIO_FILE = "mandarin.mp3"  # alternative sample: "es-US-1.wav"
# Length of each slice in milliseconds (pydub lengths and slice indices are in ms).
# NOTE(review): 25 s is presumably chosen to stay below Whisper's 30 s window — confirm.
SEGMENT_MS = 25000

sound = AudioSegment.from_file(AUDIO_FILE)
length = len(sound)
# Ceiling division. The previous `length // SEGMENT_MS + 1` produced an extra,
# empty trailing segment whenever the length was an exact multiple of
# SEGMENT_MS (and one empty segment for zero-length audio).
segment_count = -(-length // SEGMENT_MS)

print(f"{AUDIO_FILE} 's length is {len(sound)}, spilted into {segment_count} segments!")

# Each slice is written to this scratch file and read back as bytes.
filename = "temp.mp3"
for i in range(segment_count):
    start = i * SEGMENT_MS
    # The last slice stops at the end of the audio instead of a full SEGMENT_MS.
    end = min(start + SEGMENT_MS, length)
    print([start, end])
    temp = sound[start:end]
    # export() returns the file object it opened; close it so the handle is
    # not leaked on every iteration.
    temp.export(filename, format="mp3").close()
    with open(filename, "rb") as f:
        data = f.read()
    print(predictor.predict(data))



mandarin.mp3 's length is 6984, spilted into 1 segments!
[0, 6984]
{'text': '如果他们使用航空的方式运输货物,在某些航线上可能要花几天的时间才能卸货和通关。'}


# Running faster_whisper on the SageMaker notebook instance

In [None]:
!pip install faster_whisper

In [None]:
from faster_whisper import WhisperModel
import whisper

# Load the large-v3 checkpoint through faster_whisper.
model = WhisperModel("large-v3")

# Decode the audio file with openai-whisper's loader and hand the decoded
# audio to faster_whisper.
# NOTE(review): the original cell also kept a commented-out variant that passed
# a file path ("1.m4a") straight to model.transcribe — presumably equivalent; confirm.
audio = whisper.load_audio("1.mp3")
print(type(audio))

# Voice-activity filtering is enabled with a 500 ms minimum-silence setting.
vad_options = {"min_silence_duration_ms": 500}
segments, info = model.transcribe(
    audio,
    beam_size=5,
    vad_filter=True,
    vad_parameters=vad_options,
)

# Print each transcribed segment with its start and end timestamps.
for seg in segments:
    timing_and_text = (seg.start, seg.end, seg.text)
    print("[%.2fs -> %.2fs] %s" % timing_and_text)
