In [None]:
from IPython.display import Audio

import scipy.io.wavfile as wavfile
from scipy.signal import resample
import numpy as np

## Load audio file and get the correct sample rate

In [None]:
# Stem (no directory, no extension) of the recording this notebook works on.
BASE_FILENAME = "sample_patient_history"

In [None]:
# Load the recording. `wavfile.read` returns the sample rate in Hz and a NumPy
# array of samples whose dtype follows the WAV encoding (uint8, int16, int32
# or floating point).
sample_rate, audio_data = wavfile.read(f'../sample_audio/{BASE_FILENAME}.wav')

# Convert to 16-bit PCM according to the source encoding. Clipping every dtype
# straight to the int16 range would silence float files (samples in [-1, 1]
# truncate to 0) and saturate 24/32-bit files.
if np.issubdtype(audio_data.dtype, np.floating):
    audio_data = np.rint(np.clip(audio_data, -1.0, 1.0) * 32767).astype(np.int16)
elif audio_data.dtype == np.int32:
    # 24- and 32-bit PCM are both returned left-justified in int32, so the
    # top 16 bits carry the 16-bit equivalent of each sample.
    audio_data = (audio_data >> 16).astype(np.int16)
elif audio_data.dtype == np.uint8:
    # 8-bit PCM is unsigned and centred on 128.
    audio_data = ((audio_data.astype(np.int16) - 128) << 8).astype(np.int16)
else:
    # Already 16-bit (or another integer type): keep the original behaviour.
    audio_data = np.clip(audio_data, -32768, 32767).astype(np.int16)

# Report the clip duration: number of frames divided by frames per second.
audio_length_seconds = len(audio_data) / sample_rate
print(f"Audio Length: {audio_length_seconds} seconds")

In [None]:
# Render an inline audio player for the loaded recording.
# The transpose puts channels on the first axis for 2-D (multi-channel) data
# and leaves 1-D (mono) data unchanged.
Audio(data=audio_data.T, rate=sample_rate)

In [None]:
# Sample rate (Hz) that the recording will be converted to.
target_sample_rate = 16_000

# Ratio of output to input sample counts (below 1 when downsampling).
resampling_factor = target_sample_rate / sample_rate

In [None]:
# Resample the audio along the time axis (axis 0) to the target rate.
# The output length is rounded rather than truncated: `int(len * factor)` can
# lose a sample when the floating-point product lands just below a whole number.
num_resampled_samples = int(round(len(audio_data) * resampling_factor))
resampled_audio_data = resample(audio_data, num_resampled_samples)

In [None]:
# Save the resampled audio to a new 16-bit PCM WAV file.
# `resample` returns floating-point samples, and its band-limited interpolation
# can overshoot the int16 range near loud peaks. A bare int16 cast would wrap
# those samples around (audible clicks) and truncate toward zero, so round to
# the nearest integer and clip to the valid range before casting.
int16_limits = np.iinfo(np.int16)
resampled_int16 = np.clip(
    np.rint(resampled_audio_data), int16_limits.min, int16_limits.max
).astype(np.int16)
wavfile.write(f'../sample_audio/{BASE_FILENAME}_resample.wav', target_sample_rate, resampled_int16)

## Run Whisper transcription model on audio file

In [None]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration

In [None]:
# Hugging Face checkpoint used for the Whisper section below.
# Alternatives that were tried: 'openai/whisper-large-v2', 'openai/whisper-tiny'.
MODEL_ID = "openai/whisper-large-v3"

In [None]:
# Load the pretrained Whisper processor and model identified by MODEL_ID.
processor = WhisperProcessor.from_pretrained(MODEL_ID)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID)
# Clear any decoder prompt ids set in the checkpoint's config — presumably so
# generation is not pinned to a preset language/task; TODO confirm.
model.config.forced_decoder_ids = None

In [None]:
# Load the resampled copy that the resampling section of this notebook writes
# to disk, so this section depends only on files the notebook itself produces
# and is transcribing the same audio as the other models below.
sample_rate, audio_data = wavfile.read(f'../sample_audio/{BASE_FILENAME}_resample.wav')

# audio_data is a NumPy array containing the audio samples and sample_rate is
# the file's sample rate in Hz (both names are rebound here on purpose: the
# cells that follow operate on the resampled audio).

# Report the clip duration: number of frames divided by frames per second.
audio_length_seconds = len(audio_data) / sample_rate
print(f"Audio Length: {audio_length_seconds} seconds")

In [None]:
# Render an inline audio player for the audio that will be transcribed.
# The transpose puts channels on the first axis for 2-D (multi-channel) data
# and leaves 1-D (mono) data unchanged.
Audio(data=audio_data.T, rate=sample_rate)

In [None]:
# Build Whisper input features for the recording.
# WAV samples arrive as integer PCM, whereas the feature extractor works on a
# floating-point waveform in [-1, 1]; scale integer data first so the log-mel
# features land in the range the model was trained on. Data that is already
# floating point is passed through untouched.
waveform = audio_data.T
if np.issubdtype(waveform.dtype, np.integer):
    waveform = waveform.astype(np.float32) / (np.iinfo(waveform.dtype).max + 1.0)
input_features = processor(waveform, sampling_rate=sample_rate, return_tensors="pt").input_features

In [None]:
# # generate token ids
# predicted_ids = model.generate(input_features)
# # decode token ids to text
# # transcription = processor.batch_decode(predicted_ids, skip_special_tokens=False)
# transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

In [None]:
# Transcribe the recording in windows of at most 30 seconds each.
samples_per_chunk = int(sample_rate * 30)

# Number of roughly equal pieces needed so that none exceeds `samples_per_chunk`.
# `np.ceil` returns a float, so cast it to int; keep at least one section because
# `np.array_split` rejects a section count of 0 (which an empty recording yields).
num_chunks = max(1, int(np.ceil(len(audio_data) / samples_per_chunk)))
chunks = np.array_split(audio_data, num_chunks)

transcriptions = []

for chunk in chunks:
    if chunk.size == 0:
        # Nothing to transcribe (empty recording).
        continue

    # WAV samples arrive as integer PCM, whereas the feature extractor works on
    # a floating-point waveform in [-1, 1]; scale integer data first. Data that
    # is already floating point is passed through untouched.
    waveform = chunk.T
    if np.issubdtype(waveform.dtype, np.integer):
        waveform = waveform.astype(np.float32) / (np.iinfo(waveform.dtype).max + 1.0)

    input_features = processor(waveform, sampling_rate=sample_rate, return_tensors="pt").input_features
    predicted_ids = model.generate(input_features)
    decoded_batch = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    transcriptions.append(decoded_batch[0])

# Join the per-chunk transcriptions into one string.
transcription = ' '.join(transcriptions)

In [None]:
# Show the joined Whisper transcript as this cell's output.
transcription

## Run Nvidia transcription models on audio file

#### Parakeet

In [None]:
# Load the pretrained Parakeet checkpoint through NeMo's ASR collection.
import nemo.collections.asr as nemo_asr
asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(model_name="nvidia/parakeet-tdt-1.1b")

In [None]:
# Transcribe the resampled recording with Parakeet. The path is derived from
# BASE_FILENAME so it stays in sync with the file written by the resampling
# section if the constant is changed.
asr_model.transcribe([f'../sample_audio/{BASE_FILENAME}_resample.wav'])

#### Canary

In [None]:
!pip install git+https://github.com/NVIDIA/NeMo.git@r1.23.0#egg=nemo_toolkit[all]

In [None]:
from nemo.collections.asr.models import EncDecMultiTaskModel

# Load the pretrained Canary checkpoint.
canary_model = EncDecMultiTaskModel.from_pretrained('nvidia/canary-1b')

# Update the decoding parameters: take the model's current decoding config,
# set the beam size to 1, and apply the modified config back to the model.
decode_cfg = canary_model.cfg.decoding
decode_cfg.beam.beam_size = 1
canary_model.change_decoding_strategy(decode_cfg)