In [None]:
from IPython.display import Audio

import scipy.io.wavfile as wavfile
from scipy.signal import resample
import numpy as np

## Load the audio file and resample it to the target sample rate

In [None]:
# Stem of the WAV file under ../samples/ (used without the ".wav" extension).
BASE_FILENAME = "oncology1"

In [None]:
# Read the source recording; wavfile.read returns the sample rate and the
# samples as a NumPy array.
source_path = f'../samples/{BASE_FILENAME}.wav'
sample_rate, audio_data = wavfile.read(source_path)

# Duration in seconds = number of samples / samples per second.
num_samples = len(audio_data)
audio_length_seconds = num_samples / sample_rate
print(f"Audio Length: {audio_length_seconds} seconds")

In [None]:
# Inspect the sample rate returned by wavfile.read for the source file.
sample_rate

In [None]:
# Embed a player for the original recording at its native sample rate.
Audio(data=audio_data, rate=sample_rate)

In [None]:
# Rate (in Hz) that the recording is converted to before transcription.
# NOTE(review): 16 kHz presumably matches the sampling rate the Whisper
# feature extractor expects; confirm if the model is changed.
target_sample_rate = 16_000

# Output-to-input rate ratio; used to size the resampled signal.
resampling_factor = target_sample_rate / sample_rate

In [None]:
# Resample the audio data to the target rate.
# The new length is rounded rather than truncated: resampling_factor is a
# float ratio, so the product can land a hair below the intended whole number
# (or have a fractional part above .5) and int() would then drop a sample.
target_num_samples = round(len(audio_data) * resampling_factor)
resampled_audio_data = resample(audio_data, target_num_samples)

In [None]:
# Save the resampled audio to a new 16-bit PCM WAV file.
# resample() yields floating-point samples that can overshoot the original
# peak. A bare np.int16(...) cast truncates toward zero and overflows on any
# value outside the int16 range, so round to the nearest integer and clip to
# the representable range before converting.
# NOTE(review): assumes the source samples were on a 16-bit integer scale; a
# float WAV in [-1, 1] would need rescaling first. Confirm for other inputs.
int16_limits = np.iinfo(np.int16)
pcm16_audio = np.clip(
    np.rint(resampled_audio_data), int16_limits.min, int16_limits.max
).astype(np.int16)
wavfile.write(f'../samples/{BASE_FILENAME}_resample.wav', target_sample_rate, pcm16_audio)

## Run transcription model on audio file

In [None]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration

In [None]:
# Model identifier passed to from_pretrained for both the processor and the model.
MODEL_ID = "openai/whisper-large-v3"

In [None]:
# Load the Whisper model weights for MODEL_ID.
model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID)
# Clear any forced decoder ids stored on the model config.
# NOTE(review): presumably this lets generation choose language/task tokens
# itself instead of using a preset prompt; confirm against the installed
# transformers version.
model.config.forced_decoder_ids = None

# Load the matching processor, used below for feature extraction and decoding.
processor = WhisperProcessor.from_pretrained(MODEL_ID)

In [None]:
# Read back the resampled copy of the recording.
# NOTE: this rebinds `sample_rate` and `audio_data`, so from this cell onward
# they describe the resampled file rather than the original recording.
resampled_path = f'../samples/{BASE_FILENAME}_resample.wav'
sample_rate, audio_data = wavfile.read(resampled_path)

# Duration in seconds = number of samples / samples per second.
num_samples = len(audio_data)
audio_length_seconds = num_samples / sample_rate
print(f"Audio Length: {audio_length_seconds} seconds")

In [None]:
# Embed a player for the currently loaded audio to check it by ear.
Audio(data=audio_data, rate=sample_rate)

In [None]:
# wavfile.read returns integer PCM for integer WAV files, and the Whisper
# processor does not rescale integer input: it computes log-mel features on
# the values as given. Whisper expects float waveforms in [-1, 1], so divide
# signed-integer samples by the dtype's full-scale magnitude first. Any other
# dtype is only cast to float32.
if np.issubdtype(audio_data.dtype, np.signedinteger):
    full_scale = float(np.iinfo(audio_data.dtype).max) + 1.0  # 32768.0 for int16
    waveform = audio_data.astype(np.float32) / full_scale
else:
    waveform = audio_data.astype(np.float32)

# NOTE(review): the Whisper feature extractor presumably pads/truncates to a
# single 30-second window by default, so longer recordings would need
# chunking. Confirm against the clip length.
input_features = processor(waveform, sampling_rate=sample_rate, return_tensors="pt").input_features

In [None]:
# Generate token ids from the extracted input features.
predicted_ids = model.generate(input_features)
# Decode the token ids to text. skip_special_tokens=True drops the special
# tokens from the output; pass False instead to see them for debugging.
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

In [None]:
# Show the decoded transcription as the cell output.
transcription