In [7]:
# Whisper Audio Transcription in Google Colab

# 1. Install Required Dependencies
# !pip install torch transformers soundfile librosa accelerate

# 2. Import Required Libraries
import os
import sys
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import librosa

def transcribe_audio(audio_path, model_name='openai/whisper-medium', language='english'):
    """
    Transcribe an audio file to text using a Hugging Face Whisper model.

    The audio is resampled to 16 kHz and transcribed in consecutive
    30-second windows, because a single Whisper forward pass only covers
    30 seconds of audio; feeding a longer clip in one call silently drops
    everything after the first window.

    Args:
        audio_path: Path to the audio file (anything librosa can load).
        model_name: Hugging Face model id or local path of the Whisper checkpoint.
        language: Spoken language of the recording (e.g. 'english'). Pass
            None to let the model detect the language on its own.

    Returns:
        The transcription as a single string. Empty string if the file
        contains no audio samples.
    """
    # Detect device (Colab typically provides GPU)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    # Load Whisper processor and model
    processor = WhisperProcessor.from_pretrained(model_name)
    model = WhisperForConditionalGeneration.from_pretrained(model_name).to(device)

    # Load audio file, resampled to the 16 kHz rate requested here
    target_sample_rate = 16000
    audio, sample_rate = librosa.load(audio_path, sr=target_sample_rate)

    # Honour the requested language. English-only checkpoints reject the
    # language/task arguments, so only pass them to multilingual models.
    # NOTE(review): relies on `generation_config.is_multilingual` being set
    # by the checkpoint; defaults to multilingual when it is absent.
    generate_kwargs = {}
    is_multilingual = getattr(model.generation_config, "is_multilingual", True)
    if language is not None and is_multilingual:
        generate_kwargs = {"language": language, "task": "transcribe"}

    # Whisper consumes at most 30 s per call, so walk the waveform in windows.
    # NOTE(review): hard window boundaries can split a word; acceptable
    # trade-off versus losing the rest of the recording.
    window_samples = 30 * sample_rate
    texts = []
    for start in range(0, len(audio), window_samples):
        window = audio[start:start + window_samples]

        # Prepare inputs for the model
        inputs = processor(
            window,
            sampling_rate=sample_rate,
            return_tensors="pt"
        ).input_features.to(device)

        # Generate transcription
        with torch.no_grad():
            predicted_ids = model.generate(inputs, **generate_kwargs)

        # Decode the transcription
        texts.append(processor.batch_decode(predicted_ids, skip_special_tokens=True)[0])

    # Decoded windows are concatenated unchanged, so a clip that fits in one
    # window returns exactly the text of that single decode.
    return "".join(texts)

# 3. Upload Audio File in Colab
from google.colab import files

# Upload the MP3 file
print("Please upload your MP3 file:")
uploaded = files.upload()

# Nothing to do if the upload dialog was cancelled or returned no files;
# indexing into an empty result would otherwise raise IndexError.
if not uploaded:
    print("No file was uploaded; nothing to transcribe.")
else:
    # Get the filename of the uploaded file (only the first one is transcribed)
    audio_filename = next(iter(uploaded))

    # 4. Transcribe the Audio
    try:
        transcription = transcribe_audio(audio_filename)

        # 5. Save Transcription next to the audio file, e.g. foo.mp3 -> foo_transcription.txt
        output_path = os.path.splitext(audio_filename)[0] + '_transcription.txt'
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(transcription)

        print(f"\nTranscription completed. Output saved to: {output_path}")
        print("\nTranscription Preview:")
        print(transcription)

        # 6. Download the transcription file
        files.download(output_path)

    except Exception as e:
        # Best-effort notebook cell: report the failure instead of a raw traceback.
        print(f"An error occurred during transcription: {e}")

Please upload your MP3 file:


Saving Recordinga.mp3 to Recordinga.mp3
Using device: cuda


  audio, sample_rate = librosa.load(audio_path, sr=16000)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)



Transcription completed. Output saved to: Recordinga_transcription.txt

Transcription Preview:
 Good morning everyone, my name is John.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>