In [None]:
from IPython.display import Audio

import scipy.io.wavfile as wavfile
from scipy.signal import resample
import numpy as np

In [None]:
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

In [None]:
# Pick the compute device and matching tensor precision in one place:
# first CUDA GPU with half precision when available, otherwise CPU with
# single precision.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

In [None]:
# Show which device string was selected above ("cuda:0" or "cpu").
device

## Load audio

In [None]:
# Read the WAV file. `sample_rate` is the rate stored in the file (Hz) and
# `audio_data` is a NumPy array of samples; per the SciPy docs multi-channel
# files come back with shape (n_samples, n_channels).
sample_rate, audio_data = wavfile.read('../artifacts/audio/e9566c13-b385-46c1-b794-d8f419628aff.wav')

# Floating-point WAV files store samples in [-1.0, 1.0]. Clipping and casting
# those straight to int16 would truncate nearly every sample to 0, so scale
# them to the int16 range first. Integer data is passed through unchanged.
if np.issubdtype(audio_data.dtype, np.floating):
    audio_data = audio_data * 32767

# Ensure audio data is within range for int16
audio_data = np.clip(audio_data, -32768, 32767).astype(np.int16)

# Duration = number of sample frames (first axis) divided by frames per second.
audio_length_seconds = len(audio_data) / sample_rate
print(f"Audio Length: {audio_length_seconds} seconds")

In [None]:
# Show the sample rate (Hz) reported by wavfile.read for the source file.
sample_rate

In [None]:
# Render an inline audio player for the original recording.
# NOTE(review): the transpose is a no-op for 1-D (mono) data; for multi-channel
# data it presumably converts (n_samples, n_channels) to the channels-first
# layout that IPython's Audio expects — confirm against the file's channel count.
Audio(audio_data.T, rate=sample_rate)

In [None]:
# Sample rate (Hz) the audio is converted to before it is handed to the
# processors further down (passed there as `sampling_rate`).
target_sample_rate = 16_000

# Ratio of output length to input length for the resampling step:
# below 1 the signal gets fewer samples, above 1 it gets more.
resampling_factor = target_sample_rate / sample_rate

In [None]:
# Number of samples the signal should have at the target rate
# (any fractional part is truncated by int()).
n_target_samples = int(len(audio_data) * resampling_factor)

# scipy.signal.resample works along axis 0 by default, i.e. along the
# sample axis of the array returned by wavfile.read.
resampled_audio_data = resample(audio_data, n_target_samples)

In [None]:
# Render an inline player for the resampled signal so it can be compared by
# ear with the original; playback uses the target rate, not the file's rate.
Audio(resampled_audio_data.T, rate=target_sample_rate)

## Load wav2vec

In [None]:
# Identifier of the checkpoint used in this section; it is passed to
# from_pretrained() and reused as the sub-path when saving to disk.
model_id = 'facebook/wav2vec2-base-960h'

In [None]:
 # Load the processor and the CTC model for the checkpoint named by `model_id`.
 processor = Wav2Vec2Processor.from_pretrained(
     pretrained_model_name_or_path=model_id,
 )
 model = Wav2Vec2ForCTC.from_pretrained(
     pretrained_model_name_or_path=model_id,
 )

#### Save model to disk

In [None]:
# Persist both artefacts under the current directory. `model_id` contains a
# "/" so each target is a nested <owner>/<name> directory.
processor_dir = f'./processors/{model_id}'
model_dir = f'./models/{model_id}'

processor.save_pretrained(processor_dir)
model.save_pretrained(model_dir)

In [None]:
# Convert the resampled waveform into PyTorch tensors for the model.
# NOTE(review): `.T` leaves 1-D (mono) data untouched; for multi-channel data
# the processor presumably treats each channel as a separate batch item — confirm.
processor_output = processor(
    resampled_audio_data.T,
    sampling_rate=target_sample_rate,
    padding="longest",
    return_tensors="pt",
)
input_values = processor_output.input_values

In [None]:
 # Inference only: disable autograd so no computation graph is recorded for
 # the forward pass (the logits are never back-propagated, and tracking them
 # just costs memory).
 with torch.no_grad():
     # retrieve logits
     logits = model(input_values).logits
 
 # take argmax over the last (vocabulary) axis and decode the ids to text
 predicted_ids = torch.argmax(logits, dim=-1)
 transcription = processor.batch_decode(predicted_ids)

In [None]:
# Show the decoded transcription(s) returned by batch_decode.
transcription

## Load wav2vec large-medical-speed

In [None]:
# Switch to the second checkpoint; this rebinds `model_id`, so the cells below
# load and save this model instead of the previous one.
model_id = 'srujan00123/wav2vec2-large-medical-speed'

In [None]:
 # Replace `processor` and `model` with the ones for the checkpoint currently
 # named by `model_id`.
 processor = Wav2Vec2Processor.from_pretrained(
     pretrained_model_name_or_path=model_id,
 )
 model = Wav2Vec2ForCTC.from_pretrained(
     pretrained_model_name_or_path=model_id,
 )

#### Save model to disk

In [None]:
# Write the processor and model to local directories whose names mirror
# `model_id` (nested <owner>/<name> because the id contains a "/").
processor_dir = f'./processors/{model_id}'
model_dir = f'./models/{model_id}'

processor.save_pretrained(processor_dir)
model.save_pretrained(model_dir)

In [None]:
# Re-run preprocessing with the processor that is currently loaded, producing
# PyTorch tensors from the same resampled waveform.
processor_output = processor(
    resampled_audio_data.T,
    sampling_rate=target_sample_rate,
    padding="longest",
    return_tensors="pt",
)
input_values = processor_output.input_values

In [None]:
 # Inference only: run the forward pass with autograd disabled so no
 # computation graph is kept alive for logits that are never back-propagated.
 with torch.no_grad():
     # retrieve logits
     logits = model(input_values).logits
 
 # take argmax over the last (vocabulary) axis and decode the ids to text
 predicted_ids = torch.argmax(logits, dim=-1)
 transcription = processor.batch_decode(predicted_ids)

In [None]:
# Show the decoded transcription(s) produced by the second checkpoint.
transcription

## Load whisper large v3

In [None]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration

In [None]:
# Checkpoint for the Whisper section; used as the processor identifier and as
# the sub-path of the local weights directory in the next cell.
model_id = 'openai/whisper-large-v3'

In [None]:
# NOTE(review): the processor is resolved from `model_id` directly, while the
# weights are read from a directory under ../models/ — confirm that directory
# exists, since no cell visible in this notebook writes to it (the earlier
# save cells target ./models/ and ./processors/).
whisper_weights_path = f'../models/{model_id}'

processor = WhisperProcessor.from_pretrained(model_id)
model = WhisperForConditionalGeneration.from_pretrained(whisper_weights_path)

# Clear any decoder token ids pinned in the model config before generating.
model.config.forced_decoder_ids = None

In [None]:
# Run the Whisper processor on the resampled waveform. The result is read from
# `input_features` but kept under the name `input_values` used by the earlier
# sections.
whisper_inputs = processor(
    resampled_audio_data.T,
    sampling_rate=target_sample_rate,
    return_tensors="pt",
)
input_values = whisper_inputs.input_features

In [None]:
# Generate output token ids from the prepared features, then decode them to
# text with special tokens stripped from the result.
predicted_ids = model.generate(input_values)
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

In [None]:
# Show only the first decoded entry of the batch.
transcription[0]