In [18]:
import torch
import torchaudio
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

In [19]:
# Run on the GPU when CUDA is usable; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [20]:
# Fetch the Whisper "small" checkpoint: the processor bundles the audio
# feature extractor and the tokenizer, the model does the speech-to-text
# generation. The model is moved to the selected device right after loading.
processor = AutoProcessor.from_pretrained("openai/whisper-small")
model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-small").to(device)
# Bare expression so the notebook displays the module tree.
model

WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 768, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(768, 768, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 768)
      (layers): ModuleList(
        (0-11): 12 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=False)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
        

In [33]:
# Read the recording from disk. torchaudio returns the decoded samples
# together with the sample rate stored in the file.
audio_input = "Rev.mp3"
waveform, sample_rate = torchaudio.load(audio_input)

# Show the native rate so it is clear whether resampling is required.
print(sample_rate)

48000


In [23]:
# Whisper works on 16 kHz audio, so convert anything recorded at another rate.
# The guard makes the cell safe to re-run: once converted, it is a no-op.
if sample_rate != 16000:
    transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
    waveform, sample_rate = transform(waveform), 16000

In [31]:
# Preprocess audio
# Collapse the channel axis by averaging so the feature extractor always gets a
# single 1-D signal. For a one-channel tensor this is identical to squeeze(0);
# for multi-channel files (where squeeze(0) would leave a 2-D tensor) it
# down-mixes to mono.
# assumes waveform is (channels, samples), the layout torchaudio.load returned - TODO confirm
mono_waveform = waveform.mean(dim=0)

# Hand the extractor a NumPy array (its documented input type) instead of
# relying on implicit tensor conversion.
# NOTE(review): recordings longer than Whisper's 30 s window are presumably
# truncated here - confirm if longer audio must be transcribed.
inputs = processor(mono_waveform.numpy(), sampling_rate=sample_rate, return_tensors="pt")

# Move every returned tensor to the same device as the model.
inputs = {key: value.to(device) for key, value in inputs.items()}

print(inputs)

{'input_features': tensor([[[-0.6748, -0.6748, -0.6748,  ..., -0.6748, -0.6748, -0.6748],
         [-0.6748, -0.6748, -0.6748,  ..., -0.6748, -0.6748, -0.6748],
         [-0.6748, -0.6748, -0.6748,  ..., -0.6748, -0.6748, -0.6748],
         ...,
         [-0.6748, -0.6748, -0.6748,  ..., -0.6748, -0.6748, -0.6748],
         [-0.6748, -0.6748, -0.6748,  ..., -0.6748, -0.6748, -0.6748],
         [-0.6748, -0.6748, -0.6748,  ..., -0.6748, -0.6748, -0.6748]]])}


In [29]:
# Perform STT
# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    # Select language and task through generate()'s own keyword arguments
    # rather than building forced_decoder_ids by hand; the latter is the
    # legacy mechanism and is deprecated in recent transformers releases.
    generated_ids = model.generate(**inputs, language="en", task="transcribe")

print(generated_ids)

tensor([[ 1911,    11,   393,   291,   976,   385,   257,  5353,   466,  1337,
          1166,  7318,   293,   577, 25242,  2316,  1985,   293,   437,   307,
           264,  4088,  9482,   926,   309,   293,  1338,    11,  1310,   577,
          4825,    12,  8014,   338,  7318,  1985,    13]])


In [28]:
# Turn the generated token ids back into text, dropping special tokens.
# batch_decode returns one string per sequence; only the first is used here.
decoded_batch = processor.batch_decode(generated_ids, skip_special_tokens=True)
transcription = decoded_batch[0]
print("Transcription:", transcription)

Transcription:  Hey, can you give me a brief about generative AI and how diffusion model works and what is the transform architecture around it and yeah, maybe how multi-model AI works.
